Here, I analyse the transposable elements (TEs) of a world-wide whole-genome sampling of Zymoseptoria tritici, as well as RIP, the genome-defence mechanism directed against TEs.

library(knitr)
library(reticulate)

#Data wrangling and data viz
library(tidyverse)
library(purrr)
library(plotly)
library(cowplot)
library(GGally)
library(ggrepel)
library(pheatmap)
library(ggridges)
library(ggpubr)
library(tidytree)
#library(ggtree)

#Statistics
library(car)
library(corrr)
library(lsmeans)
library(multcomp)

library(sp)
library(raster)

#Variables
# World polygons for plotting sample locations (map_data() comes from ggplot2,
# loaded via tidyverse above).
world <- map_data("world")
project_dir <- "~/Documents/Postdoc_Bruce/Projects/WW_project/"
lists_dir <- "~/Documents/Postdoc_Bruce/Projects/WW_project/WW_PopGen/Keep_lists_samples/"

#Data directories
data_dir <- paste0(project_dir, "0_Data/")
metadata_dir <- paste0(project_dir, "Metadata/")
fig_dir <- "~/Documents/Postdoc_Bruce/Manuscripts/Feurtey_WW_Zt/Draft_figures/"

#Analysis directories
#-___________________
# Indentation mirrors the on-disk directory hierarchy of the project.
VAR_dir <- paste0(project_dir, "1_Variant_calling/")
  depth_per_window_dir <- paste0(VAR_dir, "1_Depth_per_window/")
  depth_per_gene_dir <- paste0(VAR_dir, "2_Depth_per_gene/")
  vcf_dir <- paste0(VAR_dir, "4_Joint_calling/")
  mito_SV <- paste0(VAR_dir, "6_Mito_SV/")
  chipseq_dir <- paste0(VAR_dir, "7_ChipSeq_peaks/")
PopStr_dir <- paste0(project_dir, "2_Population_structure/")
  nuc_PS_dir <- paste0(PopStr_dir, "0_Nuclear_genome/")
  mito_PS_dir <- paste0(PopStr_dir, "1_Mitochondrial_genome/")
Sumstats_dir <- paste0(project_dir, "3_Sumstats_demography/")
TE_RIP_dir <- paste0(project_dir, "4_TE_RIP/")
   RIP_DIR <- paste0(TE_RIP_dir, "0_RIP_estimation/")
   DIM2_DIR <- paste0(TE_RIP_dir, "1_Blast_from_denovo_assemblies/")
GEA_dir <- paste0(project_dir, "5_GEA/")
fung_dir <- paste0(project_dir, "6_Fungicide_resistance/")
virulence_dir <- paste0(project_dir, "7_Virulence/")
sel_dir <- paste0(project_dir, "8_Selection/")
  gene_list_dir <- paste0(sel_dir, "0_Lists_unique_copy/")


#Files
# Base names (no extension) of the filtered VCFs used throughout the analysis.
vcf_name <- "Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp"
vcf_name_nomaf <- "Ztritici_global_March2021.filtered-clean.AB_filtered.SNP.max-m-0.8.thin-1000bp"
vcf_name_mito <- "Ztritici_global_March2021.genotyped.mt.filtered.clean.AB_filtered.variants.good_samples.max-m-80"
Zt_list <- paste0(lists_dir, "Ztritici_global_March2021.genotyped.good_samples.args")
effectors_annotation_file <- "~/Documents/Postdoc_Eva/Manuscripts/Accepted/Alice_Cecile_Comparative_genomics/Data_for_publication/Annotations_2018_genomes_for_publication.tab"
eggnog_annotation <- paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.eggnog")
gff_file <- paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.gff3")
ref_fasta_file <- paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa")
#metadata_name = "Main_table_from_SQL_Feb_2020"
metadata_name <- "Main_table_from_SQL_June_2022_2"
# Gene and mitochondrial annotation tables loaded once at startup.
gene_annotation <- read_tsv(paste0(data_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))
complete_mito <- read_tsv(paste0(data_dir, "Complete_mitochondria_from_blast.txt"), col_names = c("ID_file", "Contig"))

# Export the paths/names as environment variables so bash chunks can use them.
Sys.setenv(PROJECTDIR=project_dir)
Sys.setenv(VARDIR=VAR_dir)
Sys.setenv(VCFDIR=vcf_dir)
Sys.setenv(POPSTR=PopStr_dir)
Sys.setenv(MITOPOPSTR=mito_PS_dir)
Sys.setenv(TERIP=TE_RIP_dir)

Sys.setenv(SUMST=Sumstats_dir)
Sys.setenv(GEADIR=GEA_dir)

Sys.setenv(ZTLIST=Zt_list)
Sys.setenv(GFFFILE = gff_file)
Sys.setenv(REFFILE = ref_fasta_file)
Sys.setenv(VCFNAME=vcf_name)
Sys.setenv(VCFNAME_NOMAF=vcf_name_nomaf)
Sys.setenv(VCFNAME_MITO=vcf_name_mito)

#knitr::opts_chunk$set(echo = F)
# Use TRUE/FALSE, never T/F (T and F are reassignable variables).
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(warning = FALSE)
# NOTE(review): knitr's `results` option normally takes a string
# ("markup"/"asis"/"hide"); TRUE is kept here to preserve the original intent
# of showing results -- confirm against the knitr documentation.
knitr::opts_chunk$set(results = TRUE)
knitr::opts_chunk$set(dev=c('png', 'pdf'))


# Metadata and sample lists
##Filtered_samples
# Samples excluded from the analysis, tagged with the reason for removal
# (identity-by-state duplicates, too many missing genotypes, mutants, etc.).
filtered_samples <- bind_rows(
  read_tsv(paste0(metadata_dir, "Sample_removed_based_on_IBS.args"), col_names = "ID_file") %>%
    mutate(Filter = "IBS"),
  read_tsv(paste0(metadata_dir, "Sample_with_too_much_NA.args"), col_names = "ID_file") %>%
    mutate(Filter = "High_NA"),
  read_tsv(paste0(metadata_dir, "Samples_to_filter_out.args"), col_names = "ID_file") %>%
    mutate(Filter = "Mutants_etc"))

##Samples in vcf
genotyped_samples <- read_tsv(Zt_list, col_names = "ID_file")

## Metadata of genotyped samples 
# The *_Description.tab file holds the column names of the main metadata table
# (one per line), so read it first and pass it as col_names below.
temp <- read_tsv(paste0(metadata_dir, metadata_name, "_Description.tab"), col_names = FALSE) %>% pull()

# Main sample metadata, restricted (inner_join) to samples present in the VCF.
Zt_meta <- read_delim(paste0(metadata_dir, metadata_name, "_with_collection.tab"), 
                 col_names = temp, delim = "\t",
                 na = "\\N", guess_max = 2000) %>%
  unite(Coordinates, Latitude, Longitude, sep = ";", remove = FALSE) %>%
  inner_join(., genotyped_samples)  %>%
  # Split the large USA and Australia samplings by region; normalise country codes.
  mutate(Country = ifelse(Country == "USA", paste(Country, Region, sep = "_"), Country)) %>%
  mutate(Country = ifelse(Country == "Australia", paste(Country, Region, sep = "_"), Country)) %>%
  mutate(Country = ifelse(Country == "NZ", "New Zealand", Country)) %>%
  mutate(Country = ifelse(Country == "CH", "Switzerland", Country)) %>%
  mutate(Latitude2 = round(Latitude, 2), Longitude2 = round(Longitude, 2)) %>%
  dplyr::select(ID_file, Continent, Country, Latitude, Longitude, Latitude2, Longitude2,
                Sampling_Date, Collection)

# Add the sNMF cluster assignment (samples with high ancestry coefficients).
temp <- read_tsv(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.tsv")) %>%
  dplyr::select(ID_file = Sample, Cluster)

Zt_meta <- full_join(Zt_meta, temp)

#genotyped_samples %>%
#  filter(!(ID_file %in% filtered_samples$ID_file)) %>%
#    write_tsv(Zt_list, col_names = F)



#Define colors
## For continents
# Named palette: one colour per level of Continent, reused by every plot via
# the Color_Continent / Fill_Continent scale objects.
#myColors <- c("#04078B", "#a10208", "#FFBA08", "#CC0044", "#5C9D06", "#129EBA","#305D1B")
myColors <- c("#DA4167", "grey", "#ffba0a", "#A20106", "#3F6E0C", "#129eba", "#8fa253" )
names(myColors) <- levels(factor(Zt_meta$Continent))
Color_Continent <- ggplot2::scale_colour_manual(name = "Continent", values = myColors)
Fill_Continent <- ggplot2::scale_fill_manual(name = "Continent", values = myColors)


#For clusters
## Simplified color scheme
# Several clusters deliberately share a colour (combined with shapes below to
# stay distinguishable).
myColors_clust <- c("#129eba", "#3F6E0C", "#DA4167", "#ffba0a", "#129eba", "#129eba", 
               "#8fa253", "#A20106", "#A20106", "#3F6E0C", "#8fa253")
names(myColors_clust) <- levels(factor(Zt_meta$Cluster))
Color_Cluster <- ggplot2::scale_colour_manual(name = "Cluster", values = myColors_clust)
Fill_Cluster <- ggplot2::scale_fill_manual(name = "Cluster", values = myColors_clust)

## Detailed color scheme
# One distinct colour per sNMF cluster (V1..V11); the vector is ordered to
# match levels(factor(Zt_meta$Cluster)), i.e. V1, V10, V11, V2, ... V9.
K_colors <- c("#0D6E82", #V1 Aus (TAS)
             "#49810E", #V10 USA
             "#E16684", #V11 North Africa
             "#FFBA0A", #V2 Europe
             "#C7F1F9", #V3 NZ
             "#21C7E8", #V4 Australia (NSW)
             "#8FA253", #V5 Uruguay + Argentina
             "#650104", #V6 Israel + Turkey
             "#DE020A", #V7 Iran
             "#2A4908", #V8 Canada
             "#B3C186") #V9 Boliva + Chile + Ecuador
names(K_colors) <- levels(factor(Zt_meta$Cluster))
Color_Cluster2 <- ggplot2::scale_colour_manual(name = "Cluster", values = K_colors)
Fill_Cluster2 <- ggplot2::scale_fill_manual(name = "Cluster", values = K_colors)

# 
# Variant of the detailed scheme with an extra "grey" entry for samples without
# a cluster assignment (recoded as "Hybrid" below).
K_colors2 <- c("grey", 
             "#8ECAE6", #V1 Aus (TAS)
             "#49810E", #V10 USA
             "#E16684", #V11 North Africa
             "#FFBA0A", #V2 Europe
             "#126782", #V3 NZ
             "#219EBC", #V4 Australia (NSW)
             "#8FA253", #V5 Uruguay + Argentina
             "#650104", #V6 Israel + Turkey
             "#DE020A", #V7 Iran
             "#2A4908", #V8 Canada
             "#B3C186") #V9 Boliva + Chile + Ecuador
# Unnamed shape vector: applied in factor-level order (12 levels incl. Hybrid).
myShapes2 <- c(1, 1, 1, 1, 1, 2, 0, 
               1, 1, 0, 0, 0)
names(K_colors2) <- levels(factor(ifelse(is.na(Zt_meta$Cluster), "Hybrid", Zt_meta$Cluster)))
Color_Cluster3 <- ggplot2::scale_colour_manual(name = "Cluster", values = K_colors2)
Fill_Cluster3 <- ggplot2::scale_fill_manual(name = "Cluster", values = K_colors2)
Shape_Cluster2 <- ggplot2::scale_shape_manual(name = "Cluster", values = myShapes2)


## Shapes for clusters
# Shapes disambiguate the clusters that share a colour in myColors_clust.
myShapes <- c(1, 1, 1, 1, 2, 0, 1, 1, 0, 0, 0)
names(myShapes) <- levels(factor(Zt_meta$Cluster))
Shape_Cluster <- ggplot2::scale_shape_manual(name = "Cluster", values = myShapes)




## For correlations
# Diverging 20-step palette for correlation heatmaps (teal -> white -> red).
mycolorsCorrel <- colorRampPalette(c("#0f8b8d", "white", "#a8201a"))(20)


# Human-readable labels for the 19 WorldClim bioclimatic variables (BIO1-BIO19).
# NOTE(review): "Mean Diurnal Range " keeps its trailing space -- confirm
# whether downstream label matching depends on it before normalising.
Bioclim_var <- c("Annual Mean Temperature", "Mean Diurnal Range ",
                "Isothermality (BIO2/BIO7) (×100)", "Temperature Seasonality (standard deviation ×100)",
                "Max Temperature of Warmest Month", "Min Temperature of Coldest Month",
                "Temperature Annual Range (BIO5-BIO6)",
                "Mean Temperature of Wettest Quarter","Mean Temperature of Driest Quarter",
                "Mean Temperature of Warmest Quarter", "Mean Temperature of Coldest Quarter",
                "Annual Precipitation", "Precipitation of Wettest Month",
                "Precipitation of Driest Month", "Precipitation Seasonality (Coefficient of Variation)",
                "Precipitation of Wettest Quarter", "Precipitation of Driest Quarter",
                "Precipitation of Warmest Quarter","Precipitation of Coldest Quarter")
#Run on the cluster

#Create bed file with 10kb windows
#(including the last window which can be smaller)
# Reads the .fai index (chrom, length, ...) and emits 0-based half-open windows.
while read chr length temp temp2 temp3; do 
  start=0
  while [ "$start" -le "$length" ]; do 
    if [ "$((start + 10000))" -le "$length" ]; then 
      echo -e "${chr}\t${start}\t$((start + 10000))"
    else
      # Last (partial) window runs to the chromosome end.
      # NOTE(review): when length is an exact multiple of 10000 this also emits
      # a zero-width window (start == end) -- confirm downstream tools tolerate it.
      echo -e "${chr}\t${start}\t$length"
    fi
    start=$((start + 10000))   # arithmetic expansion instead of legacy `expr`
  done
done < /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa.fai > /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed


#GC etc from bedtools nuc
# Per-window nucleotide composition (AT/GC fractions and base counts) for each
# 10 kb window of the reference, written as a tab-separated table.
~/Software/bedtools nuc \
    -fi /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa \
    -bed /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
    > /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab


#RIP per window 
#(my script makes a summary for all seq in a multifasta, so I tricked it by giving it a single window at a time)
 #columns are "CHROM", "Start", "End", "GC", "Product index", "Substrate index", "Composite"
 while read chr start end ; 
    do 
    # Single-window BED so the per-sequence summary covers exactly this window.
    echo -e "${chr}\t${start}\t${end}" > temp.bed ; 
    ~/Software/bedtools getfasta -fi /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa \
       -bed temp.bed -fo temp.fa ; 
    python GC_RIP_per_read_fastq.py --input_format fasta --out temp temp.fa ; 
    # Keep the "Median" row of the transposed summary (redirect replaces the
    # previous useless use of `cat`).
    values=$(~/Software/datamash-1.3/datamash transpose < temp.txt | grep "Median" | cut -f 2,3,4,5) ; 
    echo -e "${chr}\t${start}\t${end}\t$values" \
    >> /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv ; 
done < /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed


#Count variants
#columns are CHROM, Start(IN 10KB UNITS), Variant_number
# int($2/10000) bins each variant position into a 10 kb window index;
# `bedtools groupby -g 1,3 -o count -c 2` then counts variants per
# (CHROM, window) pair. Note the Start column is the window index, not bp.
zcat /data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.vcf.gz  | \
   grep -v "#"  | awk 'BEGIN {FS="\t"; OFS="\t"} {print $1,$2,int($2/10000)}' | \
   ~/Software/bedtools groupby -g 1,3 -o count -c 2 > \
   /data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.10kb_windows.SNP_counts.tab
   
   
#Gene coverage
#columns are CHROM, Start, End, Nb_overlapping_genes, Coverage_bp_gene, Window_length, Coverage_frac_gene
# Fraction of each 10 kb window (-a) covered by mRNA features (-b).
~/Software/bedtools coverage \
   -a /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
   -b /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.mRNA.gff3 \
   > /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.gene_coverage.tab
 
 #Reference TE coverage
#columns are CHROM, Start, End, Nb_overlapping_TEs, Coverage_bp_TE, Window_length, Coverage_frac_TE
# Same as the gene-coverage command above but against the TE annotation.
 ~/Software/bedtools coverage \
    -a /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
    -b /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.Badet_Oggenfuss_2019.TE.gtf \
    > /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.TE_coverage.tab

Previously, based on the study of TE and RIP in fully-assembled Z. tritici genomes, a hypothesis was drawn. The lower RIP in TEs of European samples, as compared to Iranian isolates, could indicate a loss of RIP in Z. tritici when it spread out of its area of origin. Here, I would like to investigate this possibility in the different populations.

Transposable elements: distribution, population structure, and genomic architecture

The data plotted here are based on the following steps:

  • Detect the TE insertions using ngs_TE_mapper version 2.
  • Map the reads on TE consensus (from Lorrain et al. 2020, so it includes only Iranian and European samples) and create two bins: reads mapping on TEs and reads that are not mapping.
  • Measuring the RIP composite index for reads that mapped on TE.
  • Estimating the index median in TE reads per isolate.

Transposable elements insertion polymorphisms: estimation, biases, and frequency

I detected reference and non-reference TE insertions with ngs_te_mapper2. I investigate the possible biases between collections.

#Summarize numbers
#_________________
#grep "Number" /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*/ngs_te_mapper.log | grep "non-reference" -v | cut -d ":" -f 1,7 | sed 's|/| |g' | cut -d " " -f 7,9 | sort > /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/Ref_TE_numbers.txt
#grep "Number" /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*/ngs_te_mapper.log | grep "non-reference" | cut -d ":" -f 1,7 | sed 's|/| |g' | cut -d " " -f 7,9 | sort > /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/Non-ref_TE_numbers.txt
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*_TE_numbers.txt ../4_TE_RIP/



#Gather per strain files into one big file
#_________________________________________
#cd /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper
#for direc in $dir_list ; do if [ -f ./${direc}${direc%/}_1_paired.nonref.bed ] ; then awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}_1_paired.nonref.bed ; else awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}.nonref.bed ; fi  ; done > Non-ref_all_strains.bed
#for direc in $dir_list ; do if [ -f ./${direc}${direc%/}_1_paired.ref.bed ] ; then awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}_1_paired.ref.bed ; else awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}.ref.bed ; fi  ; done > Ref_all_strains.bed
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*ef_all_strains.bed ../4_TE_RIP/


#ls -d -1 */ > directory_list.txt
# For each per-strain directory in $dir_list (entries end with "/"), collect
# the non-reference TIP bed file and prepend the strain name as a first column.
# ${direc%/} strips the trailing slash, so ./${direc}${direc%/}... resolves to
# ./<strain>/<strain>... Paired-end runs are named <strain>_1_paired.nonref.bed,
# single-end runs <strain>.nonref.bed.
for direc in $dir_list ; 
do 
  if [ -f ./${direc}${direc%/}_1_paired.nonref.bed ] ; 
  then 
      awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}_1_paired.nonref.bed ; 
  else awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}.nonref.bed ;
  fi  ; 
done > Non-ref_all_strains_2022.bed

# Same as the loop above, but for the *reference* TIP bed files (.ref.bed).
for direc in $dir_list ; 
do 
   if [ -f ./${direc}${direc%/}_1_paired.ref.bed ] ; 
   then 
       awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}_1_paired.ref.bed ; 
   else awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}.ref.bed ; 
   fi  ; 
done > Ref_all_strains_2022.bed
# Per-isolate counts of reference and non-reference TE insertions, joined to
# the sample metadata; isolates without a cluster are labelled "Hybrid".
TE_qty <- full_join(
  read_delim(paste0(TE_RIP_dir, "Non-ref_TE_numbers_2022.txt"), 
             col_names = c("ID_file", "Non_ref_insertions"),
             delim = " "),
  read_delim(paste0(TE_RIP_dir, "Ref_TE_numbers_2022.txt"), 
             col_names = c("ID_file", "Ref_insertions"),
             delim = " "))  %>%
  mutate(Total_insertions = Ref_insertions + Non_ref_insertions) %>%
  # Drop isolates with no detected insertions at all (likely failed runs).
  filter(Total_insertions > 0) %>%
  right_join(., Zt_meta) %>%
  mutate(Cluster = ifelse(is.na(Cluster), "Hybrid", Cluster))


# Reference vs non-reference insertion counts per isolate, coloured by continent.
TE_qty %>%
  ggplot(aes(x = Non_ref_insertions, y = Ref_insertions, col = Continent)) +
    geom_point(alpha = .8) +
    theme_light() +
    Color_Continent +
    # ggplot2 expects lowercase "none" for legend.position; "None" is not a
    # recognised value.
    theme(legend.position = "none")

I investigate the TE abundance in relation to the sequencing collection, to consider whether there are biases to take into account.

#Collections comparisons
# Density of per-isolate total TE insertion counts, one curve per collection.
TE_qty %>%
  ggplot(aes(x = Total_insertions, col = Collection, fill = Collection)) +
  geom_density(alpha = .4) +
  labs(title = "Comparison of whole TE content estimation per collection") +
  theme_light()

Let’s investigate possible technical biases.

# Per-isolate sequencing depth (vcftools --depth output) and per-isolate GC
# estimates, joined to the TE counts to look for technical biases.
depth <- read_tsv(paste0(vcf_dir, "Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.idepth"))
GC <- read_delim(paste0(RIP_DIR, "GC_percent.txt"), col_names = c("ID_file", "TE", "Estimate", "Median_GC", "Mean_GC"), delim = " ") %>%
  dplyr::select(-TE, -Estimate)

biases <- inner_join(TE_qty, depth, by = c("ID_file" = "INDV")) %>%
  inner_join(GC)


# Total insertions vs GC content (p1) and vs mean depth (p2), side by side.
p1 <- ggplot(biases, aes(x = Mean_GC, y = Total_insertions, col = Collection)) +
  geom_point(alpha = .5) +
  theme_light()

p2 <- ggplot(biases, aes(x = MEAN_DEPTH, y = Total_insertions, col = Collection)) +
  geom_point(alpha = .5) +
  theme_light()

cowplot::plot_grid(p1 + theme(legend.position = "none"), p2, rel_widths = c(1, 1.9))

# Same depth check, split into reference (p1) and non-reference (p2) insertions.
p1 <- ggplot(biases, aes(x = MEAN_DEPTH, y = Ref_insertions, col = Collection)) +
  geom_point(alpha = .5) +
  theme_light()

p2 <- ggplot(biases, aes(x = MEAN_DEPTH, y = Non_ref_insertions, col = Collection)) +
  geom_point(alpha = .5) +
  theme_light()

cowplot::plot_grid(p1 + theme(legend.position = "none"), p2, rel_widths = c(1, 1.9))

# Keep only rows with complete response and predictors so the nested models
# below are fitted on the same data.
temp <- biases %>%
  filter(!is.na(Cluster), !is.na(Continent),
         !is.na(Total_insertions), !is.na(MEAN_DEPTH), !is.na(Median_GC))

# Total TE insertions explained by genetic cluster plus technical covariates
# (sequencing depth and GC content).
model1 <- lm(Total_insertions ~ Cluster + MEAN_DEPTH + Median_GC, data = temp)
summary(model1)
## 
## Call:
## lm(formula = Total_insertions ~ Cluster + MEAN_DEPTH + Median_GC, 
##     data = temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -307.711  -33.709   -1.362   37.705  249.868 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 567.2297    65.3460   8.680  < 2e-16 ***
## ClusterV1    45.6478    10.1936   4.478 8.35e-06 ***
## ClusterV10  128.1398     7.6425  16.767  < 2e-16 ***
## ClusterV11  -83.9212    11.2323  -7.471 1.66e-13 ***
## ClusterV2    10.1703     6.1030   1.666 0.095921 .  
## ClusterV3    65.7702    11.2161   5.864 6.04e-09 ***
## ClusterV4    32.4357    13.3899   2.422 0.015586 *  
## ClusterV5    25.9236    11.2317   2.308 0.021188 *  
## ClusterV6   -79.6353    10.2010  -7.807 1.41e-14 ***
## ClusterV7   -71.3924    14.7251  -4.848 1.43e-06 ***
## ClusterV8    47.5169    13.4094   3.544 0.000412 ***
## ClusterV9    45.8904    14.7385   3.114 0.001898 ** 
## MEAN_DEPTH    3.9614     0.1009  39.246  < 2e-16 ***
## Median_GC    -6.5280     1.4730  -4.432 1.03e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54.42 on 1056 degrees of freedom
## Multiple R-squared:  0.7011, Adjusted R-squared:  0.6974 
## F-statistic: 190.5 on 13 and 1056 DF,  p-value: < 2.2e-16
# Same model with Continent instead of Cluster as the grouping factor.
model2 <- lm(Total_insertions ~ Continent + MEAN_DEPTH + Median_GC, data = temp)
summary(model2)
## 
## Call:
## lm(formula = Total_insertions ~ Continent + MEAN_DEPTH + Median_GC, 
##     data = temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -309.484  -34.943   -1.053   37.048  254.155 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            499.1885    58.2041   8.577  < 2e-16 ***
## ContinentAsia          -11.4220    56.8060  -0.201    0.841    
## ContinentEurope         46.2850     9.7583   4.743 2.39e-06 ***
## ContinentMiddle East   -26.2534    11.1716  -2.350    0.019 *  
## ContinentNorth America 157.1649    10.5782  14.857  < 2e-16 ***
## ContinentOceania        92.9660    10.8182   8.593  < 2e-16 ***
## ContinentSouth America  73.1153    12.2715   5.958 3.47e-09 ***
## MEAN_DEPTH               4.0470     0.1044  38.781  < 2e-16 ***
## Median_GC               -5.9783     1.3129  -4.554 5.88e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56.05 on 1061 degrees of freedom
## Multiple R-squared:  0.6815, Adjusted R-squared:  0.6791 
## F-statistic: 283.7 on 8 and 1061 DF,  p-value: < 2.2e-16
# Compare the cluster-based and continent-based models.
# NOTE(review): these two models are not nested (Cluster vs Continent), so the
# anova() F-test is of doubtful validity here -- AIC/BIC may be preferable.
anova(model1, model2)
## Analysis of Variance Table
## 
## Model 1: Total_insertions ~ Cluster + MEAN_DEPTH + Median_GC
## Model 2: Total_insertions ~ Continent + MEAN_DEPTH + Median_GC
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1   1056 3127608                                  
## 2   1061 3332809 -5   -205200 13.857 3.915e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Add Collection to the cluster model to control for batch/collection effects.
model3 <- lm(Total_insertions ~ Cluster + MEAN_DEPTH + Median_GC + Collection, data = temp)
summary(model3)
## 
## Call:
## lm(formula = Total_insertions ~ Cluster + MEAN_DEPTH + Median_GC + 
##     Collection, data = temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -114.989  -19.193   -2.402   17.114  227.916 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      152.2514    66.9437   2.274 0.023150 *  
## ClusterV1                        -26.5866     8.5842  -3.097 0.002006 ** 
## ClusterV10                        83.1622     6.2861  13.230  < 2e-16 ***
## ClusterV11                       -63.8739     6.8743  -9.292  < 2e-16 ***
## ClusterV2                         36.2213     4.7710   7.592 6.99e-14 ***
## ClusterV3                         -1.5532     8.9855  -0.173 0.862800    
## ClusterV4                         65.0074     8.5497   7.603 6.43e-14 ***
## ClusterV5                         10.4255     7.0077   1.488 0.137128    
## ClusterV6                        -45.0890     7.1527  -6.304 4.29e-10 ***
## ClusterV7                        -36.6321     9.2621  -3.955 8.17e-05 ***
## ClusterV8                         70.9785     8.8531   8.017 2.89e-15 ***
## ClusterV9                         30.4021     9.0529   3.358 0.000813 ***
## MEAN_DEPTH                         1.7115     0.1036  16.526  < 2e-16 ***
## Median_GC                          2.1107     1.4917   1.415 0.157377    
## CollectionETHZ_2020               90.4311     5.8412  15.482  < 2e-16 ***
## CollectionHartmann_FstQst_2015   -10.6307     6.7641  -1.572 0.116342    
## CollectionHartmann_Oregon_2016   163.8927     7.1481  22.928  < 2e-16 ***
## CollectionINRAE_BIOGER            41.6198     4.9551   8.399  < 2e-16 ***
## CollectionJGI                     19.7650     8.1813   2.416 0.015869 *  
## CollectionJGI_2                   41.3786     7.4722   5.538 3.88e-08 ***
## CollectionJGI_3                   34.8982     9.0242   3.867 0.000117 ***
## CollectionJGI_4                  -57.3218    33.5403  -1.709 0.087742 .  
## CollectionJGI_5                   58.9987     6.8272   8.642  < 2e-16 ***
## CollectionMMcDonald_2018         129.7312     9.0103  14.398  < 2e-16 ***
## CollectionSyngenta               139.4985     5.7606  24.216  < 2e-16 ***
## CollectionUnine_third_batch_2018 -13.1810     8.8978  -1.481 0.138808    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.79 on 1039 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.899,  Adjusted R-squared:  0.8965 
## F-statistic: 369.8 on 25 and 1039 DF,  p-value: < 2.2e-16

The fit is indeed improved by adding the collection information, with both TE quantity estimations.

The non reference TIPs have some sort of support estimation. As a first step, I want to filter the ones that don’t seem reliable.

# All TIPs (reference and non-reference) from ngs_te_mapper2, one row per
# isolate x insertion, annotated with metadata, superfamily and TE order.
TE_insertions <- bind_rows(
  read_tsv(paste0(TE_RIP_dir, "Non-ref_all_strains_2022.bed"), 
           col_names = c("ID_file", "chromosome", "position", "end", "info", "score")) %>%
    mutate(ref_non_ref_TIP = "non_ref"),
  read_tsv(paste0(TE_RIP_dir, "Ref_all_strains_2022.bed"), 
           col_names = c("ID_file", "chromosome", "position", "end", "info", "score")) %>%
    mutate(ref_non_ref_TIP = "ref")) %>%
  separate(col = chromosome, into = c("X1", "CHR", "X2", "X3", "X4"), sep = "_") %>%
  # The ngs_te_mapper2 info field is pipe-delimited.
  separate(col = info, into = c("TE_family", "TSD", "Allele_Frequency", "3_support", "5_support", "ref_reads"), sep = "\\|") %>%
  dplyr::select(-c(X1, X2, X3, X4, score))%>%
  inner_join(Zt_meta %>% dplyr::select(ID_file, Continent, Cluster, Sampling_Date, Collection)) %>%
  filter(!is.na(Continent)) %>% filter(Continent != "Asia") %>%
  # Round breakpoints down to the nearest 100 bp so nearby insertion calls
  # collapse into the same insertion ID across isolates.
  mutate(position = 100*trunc(position/100), end = 100*trunc(end/100)) %>%
  unite(TE_family, CHR, position, end, col = "TE_insertion", sep = ":", remove = FALSE) %>%
  separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = FALSE, extra = "merge") %>%
  # Family names starting with "D" are DNA transposons (class II).
  dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"))


threshold <- 0.9
subset_nb <- 10000
# Random subset used only for the density plot (note: shadows base::subset).
subset <- slice_sample(TE_insertions, n = subset_nb)

nb_non_filtered <- nrow(TE_insertions)
# Allele_Frequency is character after separate(); without as.numeric() the
# comparison against the numeric threshold was done lexicographically.
nb_filtered <- nrow(filter(TE_insertions, is.na(Allele_Frequency) | as.numeric(Allele_Frequency) > threshold))

temp <- nrow(filter(subset, as.numeric(Allele_Frequency) < threshold))
ggplot(subset, aes(as.numeric(Allele_Frequency))) +
  geom_density() +
  geom_vline(aes(xintercept = threshold), color = "#82C0CC") +
  geom_label(x = 0.5, y = 100, fill = "white", 
            label = str_wrap(paste0((100*temp/subset_nb), "% TE with allelic frequency below ", threshold,
                             ". \n   The dataset goes from ", nb_non_filtered, " detected_TE_insertions to ",
                             nb_filtered, " after filtering."), 70)) +
  theme_light() + 
  labs(x = "Allele Frequency of the TIPs PAV for each isolate",
       title = "Threshold for non-reference TIP filtering")

# Keep reference TIPs (NA frequency) and well-supported non-reference TIPs.
TE_insertions <- filter(TE_insertions, is.na(Allele_Frequency) | as.numeric(Allele_Frequency) > threshold)

Once this is done, I want to explore some basic statistics about the TIPs. In many other studies, it seems that the TIP SFS is biased toward lower frequencies. As shown above, we have a depth bias here, so it is likely that this bias, if found in Z. tritici, would be made even stronger by a non-detection artefact in low-depth genomes. So in that context, here are some questions of interest: What is the frequency of the TE insertions we found? Are they shared by several samples?

# Number of isolates carrying each distinct TE insertion.
# (A unite() into "for_display" was removed here: count() discarded that
# column anyway, so the step had no effect on the result.)
TE_insertions_counts <- TE_insertions %>%
  dplyr::count(TE_insertion, ref_non_ref_TIP)


# SFS
# Density of per-insertion carrier counts on a log10 x axis
# (a site-frequency-spectrum-like view of the TIPs).
p1 <- TE_insertions_counts %>%
  ggplot(aes(x = n)) +
  geom_density(col = "#2EC4B6", fill = "#CBF3F0") +
  scale_x_continuous(trans = "log10") +
  theme_light() +
  labs(x = "TE insertion count")




# Bin the carrier counts into frequency classes for the doughnut chart.
data <- TE_insertions_counts %>%
  mutate(for_bar = case_when(n == 1 ~ "01", 
                             n == 2 ~ "02",
                             n <= 10 ~ "10",
                             n <= 100 ~ "100",
                             n >= 100 ~ "100 +")) %>%
  dplyr::count(for_bar)

data$fraction <- data$n / sum(data$n)
data$ymax <- cumsum(data$fraction) # Compute the cumulative percentages (top of each rectangle)
data$ymin <- c(0, head(data$ymax, n=-1)) # Compute the bottom of each rectangle
data$labelPosition <- (data$ymax + data$ymin) / 2
# Slices above 2% are labelled inside the ring (label1), smaller ones outside (label2).
data$label1 <- ifelse(round(data$fraction*100) > 2, paste0(round(data$fraction*100), "%"), "")
data$label2 <- ifelse(round(data$fraction*100) > 2, "", paste0(round(data$fraction*100), "%"))


# Doughnut chart of frequencies
# Stacked rectangles wrapped into a ring via polar coordinates; the empty
# xlim(2, 4) band creates the doughnut hole.
p2 <- data %>%
  ggplot(aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=for_bar)) +
  geom_rect() +
  geom_text( x=3.5, aes(y=labelPosition, label=label1), size=4) +
  geom_text( x=4.2, aes(y=labelPosition, label=label2), size=4) +
  coord_polar(theta="y") + 
  xlim(c(2,4)) + 
  theme_void() +
  scale_fill_manual(values = c("#CBF3F0", "#2EC4B6", "#1F847A", "#1B746B", "#041F1E")) +
  labs(fill = "Number of strains",
       title = "Three quarter of all TE insertions are singletons.")

cowplot::plot_grid(p1, p2)

#TE insertions per Superfamily and frequency
# Stacked bars of insertion counts per superfamily, coloured by frequency class.
# (An unused Length column was removed: count() dropped it immediately.)
TE_insertions_counts %>%
  separate(TE_insertion, into = c("TE_family","CHR","Start", "End"), sep = ":", remove = FALSE) %>%
  separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = FALSE, extra = "merge") %>%
  dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"),
                TIP_nb = n) %>%
  dplyr::count(Order, Superfamily, TIP_nb) %>%
  mutate(for_bar = case_when(TIP_nb == 1 ~ "01", 
                             TIP_nb == 2 ~ "02",
                             TIP_nb <= 10 ~ "10",
                             TIP_nb <= 100 ~ "100",
                             TIP_nb >= 100 ~ "100 +")) %>%
  ggplot(aes(x = Superfamily, y = n, fill = for_bar)) +
  geom_bar(stat= "identity") + 
  theme_light() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) + 
  facet_wrap(vars(Order), scales = "free_x")+
  scale_fill_manual(values = c("#CBF3F0", "#2EC4B6", "#1F847A", "#1B746B", "#041F1E")) 

Although most TIPs are found at low to very low frequency in our analysis, some are identified in several samples. Among the TE insertions that are found multiple times, are the TIPs shared by isolates from different clusters, or do they always come from a single cluster?

# NOTE(review): "theshold" is a typo for "threshold"; the misspelled name is
# kept because it is reused further down in the document.
theshold <- 5

#Filtering only insertions found more than 5 times
insertions_per_pop <- TE_insertions %>%
  filter(!is.na(Cluster)) %>%
  dplyr::count(Cluster, TE_insertion, ref_non_ref_TIP) %>%
  filter(n > theshold) %>%
  # One column per cluster; NA means the insertion was not seen in that cluster.
  pivot_wider(names_from = Cluster, values_from = n) %>%
  mutate(n = 1) %>%
  dplyr::rowwise() %>%
  # Nb_population = number of the 11 clusters in which the insertion occurs.
  dplyr::mutate(NA_count = sum(is.na(c(V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11))),
                Nb_population = 11 - NA_count)


#Number of TIPS shared by populations
temp2 <- insertions_per_pop %>% 
  dplyr::count(Nb_population, name = "Insertion_number") %>%
  mutate(Insertion_type = ifelse(Nb_population == 1, "Population specific", 
                                 ifelse(Nb_population == 11, "Shared by all populations", "Intermediate")),
         # Only the two extreme classes are annotated with their counts.
         label = ifelse(Nb_population %in% c(1, 11), paste0("N=", Insertion_number), ""))
prop_non_spe <- round(sum(temp2$Insertion_number[temp2$Insertion_type != "Population specific"])/sum(temp2$Insertion_number)*100)

ggplot(temp2, aes(x = reorder(as.character(Nb_population), Nb_population), 
                  y = Insertion_number, fill = Insertion_type)) + 
  geom_bar(stat = "identity") +
  geom_text(aes(y = Insertion_number + 30, label = label), size=3) +
  geom_label(x = 7, y = max(temp2$Insertion_number)/2, 
            label = str_wrap(paste0(prop_non_spe, "% of insertions are found in two or more populations"), 20),
            fill = "white") + 
  theme_light() + 
  scale_fill_manual(values = c("grey", "#ff9f1c", "#2ec4b6")) +
  labs(x = "Number of populations displaying an insertion", 
       y = "Number of TE insertions",
       title = str_wrap(paste0("Two thirds of TE insertions found in more than ", 
                               theshold, " isolates are population-specific."),55))

So far, I’ve looked at TIPs from all TE categories together. I’m curious to see how it matches with the superfamilies and orders of TEs.

# Classify each (filtered, n > theshold) insertion by sharing pattern.
temp2 <- insertions_per_pop %>%
  mutate(Insertion_type = ifelse(Nb_population == 1, "Population specific",
                                 ifelse(Nb_population == 11, "Shared by all populations", "Intermediate"))) %>%
  dplyr::select(TE_insertion, Insertion_type)

#The insertions per pop were filtered for only TIP higher than 5, so temp2 and the following plot as well
# (An unused Length column was removed: count() dropped it immediately.)
right_join(TE_insertions_counts, temp2) %>% 
  separate(TE_insertion, into = c("TE_family","CHR","Start", "End"), sep = ":", remove = FALSE) %>%
  separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = FALSE, extra = "merge") %>%
  dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)")) %>%
  dplyr::count(Order, Superfamily, Insertion_type) %>%
  ggplot(aes(x = Superfamily, y = n, fill = Insertion_type)) +
  geom_bar(stat= "identity") + 
  theme_light() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) + 
  facet_wrap(vars(Order), scales = "free")+ 
  scale_fill_manual(values = c("grey", "#ff9f1c", "#2ec4b6"))

TE in space: population differentiation

Second, I check whether a plot and statistics including the depth of coverage still recover the difference between groups that I have observed above.

# Depth-bias check: number of TIPs per isolate against its mean sequencing
# depth, coloured/shaped by genetic cluster. A strong depth-TIP correlation
# would indicate that TIP counts are confounded by coverage.
inner_join(filter(TE_insertions_counts, n > 0), TE_insertions) %>%
  group_by(ID_file, Continent, Cluster) %>%
  dplyr::count() %>%
  inner_join(depth, by = c("ID_file" = "INDV")) %>%
  ggplot(aes(x = MEAN_DEPTH, y = n)) +
     geom_point(aes(shape = Cluster, col = Cluster), alpha = .8, size = 2) +
     theme_light() +
     labs(title = "Depth bias check TIP comparison between cluster",
          subtitle = "All TIPs (population specific and others)",
          x = "Mean depth of coverage", 
          y = paste0("Number of TIP")) +
     Color_Cluster + Shape_Cluster

# Same depth-bias check per continent, with one glm trend line per continent
# (log-depth model) to visualise any residual depth effect within continents.
inner_join(filter(TE_insertions_counts, n > 0), TE_insertions) %>%
  group_by(ID_file, Continent, Cluster) %>%
  dplyr::count() %>%
  inner_join(depth, by = c("ID_file" = "INDV")) %>%
  ggplot(aes(x = MEAN_DEPTH, y = n, color = Continent)) +
     geom_point(alpha = .2, size = 2) +
     geom_smooth(aes(), se = F, method = "glm", formula = y~log(x)) +
     theme_light() +
     labs(title = "Depth bias check TIP comparison between continent",
          subtitle = "All TIPs (population specific and others)",
          x = "Mean depth of coverage", 
          y = paste0("Number of TIP")) +
     Color_Continent 

Let’s compare the TE content per population and per continent. The statistics used here are a one-way ANOVA with block. Blocks are used in an analysis of variance or similar models in order to account for suspected variation from factors other than the treatments or main independent variables being investigated. Here I considered the collection as the confounding factor. It definitely has an effect and was thus accounted for in the statistics related to TE content and to RIP level.

Genomes from isolates in Oceania and the Americas seem to contain more TEs than those from the Middle East, in particular.

Let’s see if the same result is obtained with the TE insertions as detected by ngs_te_mapper2.

# World-wide mean number of TE insertions per isolate (scalar, used as a
# dashed reference line in the violin plots below).
TE_insertion_wrld_average = TE_qty  %>%
  dplyr::summarize(avg = mean(as.numeric(Total_insertions), na.rm = T)) %>%
  pull(avg)

# Keep isolates with a known continent (excluding Asia) and attach their
# mean depth of coverage.
temp = filter(TE_qty, !is.na(Continent) & Continent != "Asia")%>%
  inner_join(depth, by = c("ID_file" = "INDV")) 

# Depth distribution; the vertical line at 35x marks the cutoff used below.
ggplot(temp, aes(MEAN_DEPTH)) + geom_density() + geom_vline(xintercept = 35)

# Continent — one-way ANOVA of TE insertion counts between continents,
# restricted to isolates with mean depth <= 35x to limit the depth bias.
# (Original comment said "Cluster" but the model term is Continent.)
model = lm(Total_insertions ~  Continent, data=filter(temp, MEAN_DEPTH <= 35))
# Compact letter display of Sidak-adjusted pairwise comparisons: continents
# sharing a letter are not significantly different at alpha = 0.05.
CLD = cld(lsmeans(model, ~ Continent), alpha   = 0.05, Letters = letters, adjust  = "sidak")
CLD$.group=gsub(" ", "", CLD$.group)

filter(temp, MEAN_DEPTH <= 35) %>%
  ggplot(aes(x = Continent, y = Total_insertions, fill = Continent)) +
    geom_violin(alpha = .8)  +
  theme_light()  +
    theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = str_wrap("Number of insertions per sample", width = 30),
        title = "TE insertions per continent",
        subtitle = str_wrap(paste(""), width = 70)) +
  # CLD letters drawn at a fixed height of y = 750.
  geom_text(data = CLD, aes(x = Continent, label = .group, y = 750), color   = "black") +
  Fill_Continent +
  # Grey point = per-continent mean; dashed line = world-wide average.
  stat_summary(aes(x = Continent,  y = Total_insertions), fun = mean, geom = "point", size = 2, color = "grey30") +
  geom_hline(aes(yintercept = TE_insertion_wrld_average), color = "gray30", size = 0.6, linetype = "dashed")

# Summary table of insertion counts per continent (median/mean/min/max).
filter(temp, MEAN_DEPTH <= 35) %>%
  group_by(Continent) %>%
  summarize(Median_insertions = round(median(Total_insertions, na.rm = T)), 
            Mean_insertions = round(mean(Total_insertions, na.rm = T)),
            Min_insertions = min(Total_insertions, na.rm = T), 
            Max_insertions = max(Total_insertions, na.rm = T))
## # A tibble: 6 × 5
##   Continent     Median_insertions Mean_insertions Min_insertions Max_insertions
##   <chr>                     <dbl>           <dbl>          <dbl>          <dbl>
## 1 Africa                      300             304            184            429
## 2 Europe                      334             340            209            669
## 3 Middle East                 302             282            164            394
## 4 North America               462             456            283            586
## 5 Oceania                     373             375            258            513
## 6 South America               375             364            237            476
# World-wide mean insertions per isolate (recomputed here — identical to the
# value computed above; kept so this chunk is self-contained).
TE_insertion_wrld_average = TE_qty  %>%
  dplyr::summarize(avg = mean(as.numeric(Total_insertions), na.rm = T)) %>%
  pull(avg)


# Cluster — one-way ANOVA of insertion counts between genetic clusters
# (all isolates, no depth filter), with Sidak-adjusted compact letter display.
model = lm(Total_insertions ~  Cluster, data=TE_qty)
CLD = cld(lsmeans(model, ~ Cluster), alpha   = 0.05, Letters = letters, adjust  = "sidak")
CLD$.group=gsub(" ", "", CLD$.group)

TE_qty %>%
  filter(!is.na(Cluster)) %>%
  ggplot(aes(x = Cluster, y = Total_insertions, fill = Cluster)) +
    geom_violin(alpha = .8)  +
  theme_light()  +
    theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = str_wrap("Number of insertions per sample", width = 30),
        title = "TE insertions per cluster",
        subtitle = str_wrap(paste(""), width = 70)) +
  geom_text(data = CLD, aes(x = Cluster, label = .group, y = 750), color   = "black") +
  Fill_Cluster +
  # Grey point = per-cluster mean; dashed line = world-wide average.
  stat_summary(aes(x = Cluster,  y = Total_insertions), fun = mean, geom = "point", size = 2, color = "grey30") +
  geom_hline(aes(yintercept = TE_insertion_wrld_average), color = "gray30", size = 0.6, linetype = "dashed")

The pattern is somewhat different with the TE insertions. In this case, the Oceania samples don’t look particularly high. However, the pattern with the African and Middle-Eastern samples being lower is even clearer.

I know for sure that my estimates are biased, either because of depth, GC content, or both. So I want to make some more checks to ensure that I am confident in the results I show. First, I want to try and compare the clusters intra-collection to ensure that the results observed previously can be confirmed within each collection.

# Tabulate sample counts per collection x cluster; cells with fewer than 5
# samples are masked to NA, then the table is widened (one column per cluster)
# and NA_count records how many of the V1-V11 cluster columns are empty per
# collection. NOTE(review): the "Hybrid" column (see printed table below) is
# not included in the NA_count sum — confirm this is intentional.
temp = TE_qty %>%
  filter(Cluster != "NA") %>%
  dplyr::count(Collection, Cluster) %>%
  mutate(n = ifelse(n < 5, NA, n)) %>%
  pivot_wider(names_from = Cluster, values_from = n)%>%
  rowwise() %>%
  dplyr::mutate(NA_count = sum(is.na(c(V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11))))
                
temp
## # A tibble: 16 × 14
## # Rowwise: 
##    Collection Hybrid    V2   V10   V11    V5    V6    V7    V8    V9    V4    V1
##    <chr>       <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
##  1 Eschikon_…     NA    93    NA    NA    NA    NA    NA    NA    NA    NA    NA
##  2 ETHZ_2020      24    NA    24    14    22    14     7     8    12    NA    NA
##  3 Hartmann_…     NA    29    50    NA    NA    25    NA    NA    NA    27    NA
##  4 Hartmann_…     NA    NA    94    NA    NA    NA    NA    NA    NA    NA    NA
##  5 INRAE_BIO…      5   102    NA    NA    NA    NA    NA    NA    NA    NA    NA
##  6 JGI             6    13     6    NA    NA    NA    NA    10    NA    NA    NA
##  7 JGI_1          NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##  8 JGI_2          11    11    NA    NA    NA    NA    NA    NA    NA    NA    NA
##  9 JGI_3           6     8    NA    NA    NA    NA     6    NA    NA    NA    NA
## 10 JGI_4          NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 11 JGI_5          18    11    NA     7    NA    NA    NA    NA    NA    NA    NA
## 12 Megan_McD…     NA    NA    NA    NA    NA    NA    NA    NA    NA     9    NA
## 13 MMcDonald…     21    NA    NA    NA    NA    NA    NA    NA    NA    NA    40
## 14 Syngenta        7   273    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 15 Unine_thi…     NA     8    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 16 <NA>           NA     6    NA    NA    NA    NA    NA    NA    NA    NA    NA
## # … with 2 more variables: V3 <int>, NA_count <int>
# Keep collections represented by at least two clusters (<= 9 empty cluster
# columns out of 11), back to long format, then compare TE insertion counts
# between clusters WITHIN each collection — a within-collection check of the
# between-cluster differences seen above.
collections_multiple = temp %>% 
  filter(NA_count <= 9) %>% 
  pivot_longer(cols = -c(Collection, NA_count), names_to = "Cluster", values_to = "n") %>% 
  filter(!is.na(n))
inner_join(TE_qty, dplyr::select(collections_multiple, Collection, Cluster)) %>%  
ggplot() +
  theme_light()  +
  theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = str_wrap("Percentage of reads", width = 30),
        subtitle = str_wrap(paste(""), width = 70)) +
  geom_boxplot(aes(x = Cluster, y = Total_insertions, fill = Cluster), alpha = .8) +
  Fill_Cluster + Color_Cluster +
  labs(title = "TE insertions per collection per cluster") + 
  facet_wrap(vars(Collection), scales = "free")

We can see that different clusters and continents have different overall TE contents. However, I am curious to see if there is a geographical clustering intrinsic to the TE content that can be uncovered without an a priori sorting of samples.

# TIP counts per superfamily per isolate, split by TE order and continent.
temp = TE_insertions %>% dplyr::count(Superfamily, ID_file, Order, Continent, name = "Nb_TIP")

# Class I (retrotransposons): one boxplot per superfamily x continent.
p1 = filter(temp, Order == "Class I (retrotransposons)") %>%
  ggplot(aes(x = Superfamily,
             y = Nb_TIP,
             color = Continent)) +
    geom_boxplot(outlier.shape = NA) +
    theme_light() +
    labs(y = "TIPs") +
    theme(axis.text.x = element_text(angle = 40, hjust = 1))   +
  Color_Continent 

# Class II (DNA transposons): same layout.
p2 = filter(temp, Order == "Class II (DNA transposons)")%>%
  ggplot(aes(x = Superfamily,
             y = Nb_TIP,
             color = Continent)) +
    geom_boxplot(outlier.shape = NA) +
    theme_light() +
    ylab("TIPs") +
    theme(axis.text.x = element_text(angle = 40, hjust = 1))  +
  Color_Continent

# Stack the two panels (log10 y scale) and place the shared legend to the right.
temp = cowplot::plot_grid(p1 + theme(legend.position = "None", axis.title.x = element_blank()) +
                                        scale_y_continuous(trans = "log10"),
                   p2 + theme(legend.position = "None") + scale_y_continuous(trans = "log10"), 
                   ncol = 1)
cowplot::plot_grid(temp, get_legend(p1), nrow = 1, rel_widths = c(1, 0.3))

# Per cluster
# Reorder a factor independently within each facet, tidytext-style.
# Each value of `x` is suffixed with its `within` group (joined by `sep`),
# making identical labels in different facets distinct; the combined labels
# are then ordered by `fun` applied to `by`. Pair with scale_x_reordered()
# to strip the suffix from the axis labels.
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  stats::reorder(paste(x, within, sep = sep), by, FUN = fun)
}


# Companion to reorder_within(): a discrete x scale whose labels drop the
# "___<facet>" suffix added by reorder_within(), restoring the display names.
scale_x_reordered <- function(..., sep = "___") {
  strip_suffix <- function(labels) gsub(paste0(sep, ".+$"), "", labels)
  ggplot2::scale_x_discrete(labels = strip_suffix, ...)
}


# TIP counts per superfamily per isolate with an assigned cluster; the
# per-superfamily median (General_median) is used below to drop rare
# superfamilies from the plots.
TIP_per_superfamily_per_sample = TE_insertions %>% dplyr::count(Superfamily, ID_file, Order, Cluster, name = "Nb_TIP") %>%
  filter(!is.na(Cluster)) %>% group_by(Order, Superfamily) %>% 
  mutate(General_median = median(Nb_TIP))

# Class I: violin + boxplot of TIPs per cluster, one facet per superfamily
# (median >= 8 only); clusters ordered within each facet by median TIP count.
filter(TIP_per_superfamily_per_sample, Order == "Class I (retrotransposons)") %>%
  filter(General_median >= 8 ) %>%
  ggplot(aes(x = reorder_within(Cluster, Nb_TIP, Superfamily, median),
             y = Nb_TIP,
             color = Cluster))  +
    geom_violin(aes(fill = Cluster), alpha = .4) +
    geom_boxplot(outlier.shape = NA, width = .1) +
    theme_light() +
    theme(axis.text.x = element_text(angle = 40, hjust = 1))  +
  Color_Cluster + Fill_Cluster + 
  facet_wrap(vars(Superfamily), scales = "free") +
  labs(title = "Class I (retrotransposons)", x ="Genetic cluster", y = "TIPs") + 
  scale_x_reordered()

# Same visualisation for class II (DNA transposons): violin + narrow boxplot
# of TIP counts per genetic cluster, one facet per superfamily (restricted to
# superfamilies whose overall median TIP count is >= 8); clusters are ordered
# within each facet by their median TIP count.
TIP_per_superfamily_per_sample %>%
  filter(Order == "Class II (DNA transposons)", General_median >= 8) %>%
  ggplot(aes(x = reorder_within(Cluster, Nb_TIP, Superfamily, median),
             y = Nb_TIP,
             color = Cluster)) +
  geom_violin(aes(fill = Cluster), alpha = .4) +
  geom_boxplot(outlier.shape = NA, width = .1) +
  theme_light() +
  theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  Color_Cluster +
  Fill_Cluster +
  facet_wrap(vars(Superfamily), scales = "free") +
  scale_x_reordered() +
  labs(title = "Class II (DNA transposons)", x = "Genetic cluster", y = "TIPs")

# Stacked barplot of TIP counts per isolate, coloured by superfamily, one
# panel per cluster. Nb_TIP_per_ID (total TIPs per isolate) is used only to
# order isolates within each cluster panel.
TIP_per_superfamily_per_sample %>%
  group_by(ID_file) %>%
  mutate(Nb_TIP_per_ID = sum(Nb_TIP)) %>%
  ggplot(aes(reorder_within(x = ID_file, by = Nb_TIP_per_ID, within = Cluster), Nb_TIP, fill = Superfamily)) +
  geom_bar(stat = "identity") +
  facet_wrap(vars(Cluster), scales = "free_x") +
    theme(axis.text.x = element_blank())

# Same data as above but as proportions (position = "fill"): the superfamily
# composition of each isolate's TIPs, panels of width proportional to the
# number of isolates per cluster (space = "free").
TIP_per_superfamily_per_sample %>%
  group_by(ID_file) %>%
  mutate(Nb_TIP_per_ID = sum(Nb_TIP)) %>%
  ggplot(aes(ID_file, Nb_TIP, fill = Superfamily)) +
  geom_bar(stat = "identity", position = "fill") +
    theme(axis.text.x = element_blank())+
  facet_grid(col = vars(Cluster), scales = "free_x",  space = "free") + 
  theme(panel.spacing.x=unit(0.2, "lines"),panel.spacing.y=unit(1, "lines"))

Let’s compare TE content using the Middle-Eastern and African clusters as a baseline reference.

# Using the three Middle-Eastern and African 
average_TIP_origin = TIP_per_superfamily_per_sample %>% 
  filter(Cluster %in% c("V6", "V7", "V11")) %>%
  group_by(Superfamily, Order) %>% 
  summarize(Median_origin = median(Nb_TIP))


# Class I (retrotransposons)
temp = full_join(TIP_per_superfamily_per_sample, average_TIP_origin) %>%
  filter(General_median >= 8 ) %>%
  filter(!(Cluster %in% c("V6", "V7", "V11"))) %>%
  mutate(Ratio_TIP_to_origin = Nb_TIP/Median_origin) %>%
  filter(Order == "Class I (retrotransposons)")

temp2 = temp %>% group_by(Cluster, Superfamily) %>%
  summarize(Med = median(Ratio_TIP_to_origin))
ggplot()  +
    geom_violin(data = temp, aes(x = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"), 
                                 y = Ratio_TIP_to_origin, fill = Cluster), col = NA, alpha = .3) +
    #geom_jitter(data = temp, aes(x = Cluster, y = Ratio_TIP_to_origin, color = Cluster), width = .1, alpha = .1)  +
    geom_hline(yintercept = 1) +
    geom_segment(data = temp2, aes(x = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"), 
                                   xend = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"), 
                                   y = 1, yend = Med, color = Cluster), size = 1) +
    geom_point(data = temp2, aes(x = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"), 
                                 y = Med, color = Cluster), size = 2) +
    theme_light() +
    theme(axis.text.x = element_text(angle = 40, hjust = 1))  +
    Color_Cluster + Fill_Cluster + 
    facet_wrap(vars(Superfamily), scales = "free_y") +
    labs(title = "Class I (retrotransposons)", x ="Genetic cluster", y = "TIPs ratio to origin") + 
    scale_x_reordered()

ggsave("Large_TIP_per_family_prop_classI.pdf", 
       dpi = 500, width = 33, height = 15, units = "cm")

# Class II (DNA transposons)
temp = full_join(TIP_per_superfamily_per_sample, average_TIP_origin) %>%
  filter(General_median >= 8 ) %>%
  filter(!(Cluster %in% c("V6", "V7", "V11"))) %>%
  mutate(Ratio_TIP_to_origin = Nb_TIP/Median_origin) %>%
  filter(Order == "Class II (DNA transposons)")

temp2 = temp %>% group_by(Cluster, Superfamily) %>%
  summarize(Med = median(Ratio_TIP_to_origin))
ggplot()  +
    geom_violin(data = temp, aes(x = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"), 
                                 y = Ratio_TIP_to_origin, fill = Cluster), col = NA, alpha = .3) +
    #geom_jitter(data = temp, aes(x = Cluster, y = Ratio_TIP_to_origin, color = Cluster), width = .1, alpha = .1)  +
    geom_hline(yintercept = 1) +
    geom_segment(data = temp2, aes(x = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"), 
                                   xend = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"), 
                                   y = 1, yend = Med, color = Cluster), size = 1) +
    geom_point(data = temp2, aes(x = fct_relevel(Cluster, "V2", "V10", "V8", "V5", "V9", "V1", "V3", "V4"), 
                                 y = Med, color = Cluster), size = 2) +
    theme_light() +
    theme(axis.text.x = element_text(angle = 40, hjust = 1))  +
    Color_Cluster + Fill_Cluster + 
    facet_wrap(vars(Superfamily), scales = "free_y") +
    labs(title = "Class II (DNA transposons)", x ="Genetic cluster", y = "TIPs ratio to origin") + 
    scale_x_reordered()

ggsave("Large_TIP_per_family_prop_classII.pdf", 
       dpi = 500, width = 25, height = 15, units = "cm")

Per TE family now, instead of superfamily. Can we identify individual TEs that have spread in the genomes outside of the center of origin?

# Per TE family (finer than superfamily): TIP counts per isolate per cluster.
TIP_per_family_per_sample = TE_insertions %>% dplyr::count(TE_family, Superfamily, ID_file, Order, Cluster, name = "Nb_TIP") %>%
  filter(!is.na(Cluster)) %>% group_by(Order, Superfamily, TE_family) %>% 
  mutate(General_median = median(Nb_TIP))

# Keep only families reaching > 10 TIPs in at least one isolate.
to_keep = unique(filter(TIP_per_family_per_sample, Nb_TIP > 10)$TE_family)

# Class I families: violin + boxplot of TIPs per cluster, one facet per
# family (name truncated at the first "-"), clusters ordered by median.
TIP_per_family_per_sample %>% filter(TE_family %in% to_keep) %>%
  filter(Order == "Class I (retrotransposons)") %>%
  separate(TE_family, into = c("TE_family_short", "Temp"), remove = F, sep = "-") %>%
  ggplot(aes(x = reorder_within(Cluster, by = Nb_TIP, fun = median, within = TE_family_short),
             y = Nb_TIP,
             color = Cluster))  +
    geom_violin(aes(fill = Cluster), alpha = .4) +
    geom_boxplot(outlier.shape = NA, width = .1) +
    theme_light() +
    theme(axis.text.x = element_text(angle = 40, hjust = 1))  +
    Color_Cluster + Fill_Cluster + 
    facet_wrap(vars(TE_family_short), scales = "free") +
    labs(title = "Class I (retrotransposons)", x ="Genetic cluster", y = "TIPs") + 
    scale_x_reordered()

ggsave("Large_TIP_per_family_per_cluster_classI.pdf", 
       dpi = 500, width = 30, height = 20, units = "cm")

# Class II families: same layout.
TIP_per_family_per_sample %>% filter(TE_family %in% to_keep) %>%
  filter(Order == "Class II (DNA transposons)") %>%
  separate(TE_family, into = c("TE_family_short", "Temp"), remove = F, sep = "-") %>%
  ggplot(aes(x = reorder_within(Cluster, by = Nb_TIP, fun = median, within = TE_family_short),
             y = Nb_TIP,
             color = Cluster))  +
    geom_violin(aes(fill = Cluster), alpha = .4) +
    geom_boxplot(outlier.shape = NA, width = .1) +
    theme_light() +
    theme(axis.text.x = element_text(angle = 40, hjust = 1))  +
    Color_Cluster + Fill_Cluster + 
    facet_wrap(vars(TE_family_short), scales = "free") +
    labs(title = "Class II (DNA transposons)", x ="Genetic cluster", y = "TIPs") + 
    scale_x_reordered()

ggsave("Large_TIP_per_family_per_cluster_classII.pdf", 
       dpi = 500, width = 36, height = 20, units = "cm")

Let’s get some numbers for the text.

# Summary numbers quoted in the manuscript text: per-family TIP counts per
# isolate, inside vs outside the center of origin (Africa + Middle East).
temp = TE_insertions %>% 
  dplyr::count(TE_family, ID_file, Order, Continent, name = "Nb_TIP") %>%
  filter(!is.na(Continent) & Continent != "Asia") 

# RLC Deimos: mean TIPs per isolate, center of origin vs elsewhere.
x = filter(temp, Continent %in% c("Africa", "Middle East")) %>%
  filter(TE_family == "RLC_Deimos_consensus-1") %>% pull(Nb_TIP) %>% mean()
paste0("The mean number of RLC Deimos per isolate in the center of origin is ", x)
## [1] "The mean number of RLC Deimos per isolate in the center of origin is 11.3137254901961"
x = filter(temp, !(Continent %in% c("Africa", "Middle East")))%>%
  filter(TE_family == "RLC_Deimos_consensus-1") %>% pull(Nb_TIP) %>% mean()
paste0("The mean number of RLC Deimos per isolate outside the center of origin is ", x)
## [1] "The mean number of RLC Deimos per isolate outside the center of origin is 50.784375"
# RLX Styx: median / mean / max per isolate in the center of origin...
x = filter(temp, Continent %in% c("Africa", "Middle East")) %>%
  filter(TE_family == "RLX_Styx_consensus-1_partial") %>% pull(Nb_TIP) %>% median()
y = filter(temp, Continent %in% c("Africa", "Middle East")) %>%
  filter(TE_family == "RLX_Styx_consensus-1_partial") %>% pull(Nb_TIP) %>% mean()
z = filter(temp, Continent %in% c("Africa", "Middle East")) %>%
  filter(TE_family == "RLX_Styx_consensus-1_partial") %>% pull(Nb_TIP) %>% max()
print(paste(x, y, z, sep = " "))
## [1] "1 1.77586206896552 17"
paste0("The median number of RLX Styx per isolate in the center of origin is ", x)
## [1] "The median number of RLX Styx per isolate in the center of origin is 1"
# ...and outside of it.
x = filter(temp, !(Continent %in% c("Africa", "Middle East")))%>%
  filter(TE_family == "RLX_Styx_consensus-1_partial") %>% pull(Nb_TIP) %>% median()
y = filter(temp, !(Continent %in% c("Africa", "Middle East"))) %>%
  filter(TE_family == "RLX_Styx_consensus-1_partial") %>% pull(Nb_TIP) %>% mean()
z = filter(temp, !(Continent %in% c("Africa", "Middle East"))) %>%
  filter(TE_family == "RLX_Styx_consensus-1_partial") %>% pull(Nb_TIP) %>% max()
print(paste(x, y, z, sep = " "))
## [1] "5 5.79337899543379 26"
paste0("The median number of RLX Styx per isolate outside the center of origin is ", x)
## [1] "The median number of RLX Styx per isolate outside the center of origin is 5"

TE clustering

First, I will use the ngs_te_mapper2 results, and use each insertion identified in more than 5 samples to create different clustering. First, I’ll simply create a heatmap, then I’ll move on to PCAs, first by cluster, then with each isolate.

#PCA per isolate
matrice = TE_insertions %>%
  #filter(!is.na(Cluster)) %>%
  mutate(Cluster = ifelse(is.na(Cluster), "Hybrid", Cluster)) %>%
  inner_join(filter(TE_insertions_counts, n > 10)) %>%
  dplyr::count(TE_insertion, ID_file, Cluster) %>%
  group_by(ID_file) %>%
  mutate(Count_strain = sum(n)) %>%
  pivot_wider(names_from = TE_insertion, values_from = n, values_fill = 0) 


temp = as.matrix(matrice[,c(3:ncol(matrice))])[, which(apply(as.matrix(matrice[,c(3:ncol(matrice))]), 2, var) != 0)]

TE.pca = prcomp(temp, center = TRUE, scale. = TRUE)

#Plot

temp = as.tibble(cbind(matrice, as.data.frame(TE.pca$x))) %>%
  dplyr::select(ID_file, Cluster, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8)

p1 = ggplot(temp, aes(x = PC1, y = PC2, col = Cluster, shape = Cluster)) +
  geom_point() +
  Color_Cluster3 + Shape_Cluster2 +
  theme_light()
p2 = ggplot(temp, aes(x = PC3, y = PC4, col = Cluster, shape = Cluster)) +
  geom_point() +
  Color_Cluster3 + Shape_Cluster2 +
  theme_light() + theme(legend.position = "none")
p3 = ggplot(temp, aes(x = PC5, y = PC6, col = Cluster, shape = Cluster)) +
  geom_point() +
  Color_Cluster3 + Shape_Cluster2 +
  theme_light() + theme(legend.position = "none")
p4 = ggplot(temp, aes(x = PC7, y = PC8, col = Cluster, shape = Cluster)) +
  geom_point() +
  Color_Cluster3 + Shape_Cluster2 +
  theme_light() + theme(legend.position = "none")
temp = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2, p3, p4)
cowplot::plot_grid(temp, get_legend(p1),
                   ncol = 2, rel_widths = c(1, .1))

It does look like there is clustering of samples according to geography. This is interesting as it shows that the types of TEs are different in different populations.


TE insertions: where in the genome?

I want to gain some insights about where, in the genome, are these TIPs we have detected: are they on specific chromosomes? Is there a difference between core and accessory chromosomes? Can we detect hotspots of transposition, i.e., are there places in the genome which tend to have more TIPs than others?

# Count distinct TE insertions per 10-kb window (window index = midpoint
# of the insertion divided by 10 kb, truncated).
TE_insertions_per_window = TE_insertions %>%
  mutate(CHR = as.numeric(CHR),
         window = trunc(((position + end)/2)/10000)) %>%
  dplyr::select(TE_insertion, CHR, window) %>%
  distinct() %>%
  group_by(CHR, window) %>%
  dplyr::count() 

# Genome-wide median and SD of the per-window TIP count.
summar = ungroup(TE_insertions_per_window) %>% 
  summarise(median_TE = median(n),
            sd_TE = sd(n))

# Flag windows with >= median + 3 SD TIPs as outliers.
# NOTE(review): the dashed reference lines in the plots below are drawn at
# median + 2 SD, not 3 SD — confirm the discrepancy is intentional.
TE_insertions_per_window  = TE_insertions_per_window %>%
  #inner_join(summar) %>%
  mutate(outlier = ifelse(n >= summar$median_TE + 3*summar$sd_TE, "outlier", "non_outlier")) 

# Export the per-window counts and outlier status as a BED-like table.
TE_insertions_per_window %>%
  ungroup() %>%
  mutate(Start = window*10000, End = Start + 10000) %>%
  dplyr::select(-window, TIP_count = n, TIP_category = outlier) %>%
  write_tsv(file = paste0(TE_RIP_dir, "Ztritici_global_March2021.good_samples.10kb_windows.TIP_counts.tab"))

# TIP counts per 10-kb window along chromosomes 1-7, one facet row per
# chromosome; outlier windows appear in the darker shade and the dashed grey
# line marks median + 2 SD of the genome-wide per-window TIP count.
TE_insertions_per_window %>%
  filter(as.numeric(CHR) < 8) %>%
  ggplot(aes(x = window, y = n, col = as.character(outlier))) +
  theme_light() +
  theme(legend.position = "none") +
  scale_color_manual(values = c("#f5cac3", "#f28482")) +
  geom_point(alpha = .8) +
  geom_hline(yintercept = summar$median_TE + 2 * summar$sd_TE,
             linetype = "dashed", col = "grey80") +
  facet_grid(rows = vars(CHR), scales = "free_x")

# Same per-window TIP plot for chromosomes 8-13 (remaining core chromosomes).
filter(TE_insertions_per_window, as.numeric(CHR) >= 8 & as.numeric(CHR) < 14) %>%
  ggplot(aes(x = window, y = n, col = outlier)) +
    theme_light() + 
    theme(legend.position = "none") +
    scale_color_manual(values = c("#f5cac3", "#f28482")) +
    geom_point(alpha = .8) +
    geom_hline(yintercept = summar$median_TE + 2*summar$sd_TE,
               linetype="dashed", col = "grey80") +
    facet_grid(rows = vars(CHR), scales = "free_x")

# And for chromosomes 14-21 (accessory chromosomes).
filter(TE_insertions_per_window, as.numeric(CHR) >= 14) %>%
  ggplot(aes(x = window, y = n, col = outlier)) +
    theme_light() + 
    theme(legend.position = "none") +
    scale_color_manual(values = c("#f5cac3", "#f28482")) +
    geom_point(alpha = .8) +
    geom_hline(yintercept = summar$median_TE + 2*summar$sd_TE,
               linetype="dashed", col = "grey80") +
    facet_grid(rows = vars(CHR), scales = "free_x")

# Genome-wide mean TIPs per 10-kb window (scalar, for the reference line).
label1 = ungroup(TE_insertions_per_window) %>%
  filter(!is.na(CHR)) %>%
  dplyr::summarize(Average = mean(n)) %>% pull()
# p1: mean TIPs per window for each chromosome, core (1-13) vs accessory (14+).
p1 = ungroup(TE_insertions_per_window) %>%
  filter(!is.na(CHR)) %>%
  group_by(CHR) %>%
  dplyr::summarize(Average = mean(n)) %>%
  dplyr::mutate(Chromosome_type = ifelse(CHR < 14, "core", "accessory")) %>%
  ggplot(aes(x = reorder(as.character(CHR), CHR), y = Average, fill = Chromosome_type)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = label1, linetype = "dashed", col = "grey40") + 
  theme_light() +
  scale_fill_manual(values = c("#ffa62b", "#2FC6B7")) +
  labs(title = paste0("GW TIP number average is ", round(label1, 2)), 
       x = "Chromosome", y = "Mean # TIPs per 10kb") +
  theme(legend.position = "none")


# p2: percentage of outlier windows per chromosome.
p2 = ungroup(TE_insertions_per_window) %>%
  dplyr::count(CHR, outlier) %>%
  group_by(CHR) %>%
  dplyr::mutate(Tot_window = sum(n)) %>%
  mutate(prop = 100*n/Tot_window) %>%
  mutate(Chromosome_type = ifelse(CHR < 14, "core", "accessory")) %>%
  filter(outlier == "outlier") %>%
  ggplot(aes(x = reorder(as.character(CHR), CHR), y = prop, fill = Chromosome_type)) +
  geom_bar(stat = "identity") +
  theme_light()  +
  scale_fill_manual(values = c("#ffa62b", "#2FC6B7")) +
  labs(title = "Proportion of outlier windows", 
       x = "Chromosome", y = "% outlier windows") +
  theme(legend.position = "none") +
  geom_text(aes(y = 22, label = paste0(round(prop, 2), "%")), size = 3)

# Both panels side by side, flipped so chromosomes read top to bottom.
cowplot::plot_grid(p1 + coord_flip(), p2 + coord_flip())

# Table version of p2: outlier window count and proportion per chromosome.
ungroup(TE_insertions_per_window) %>%
  dplyr::count(CHR, outlier) %>%
  group_by(CHR) %>%
  dplyr::mutate(Tot_window = sum(n)) %>%
  mutate(prop = 100*n/Tot_window) %>%
  filter(outlier == "outlier")
## # A tibble: 19 × 5
## # Groups:   CHR [19]
##      CHR outlier     n Tot_window  prop
##    <dbl> <chr>   <int>      <int> <dbl>
##  1     1 outlier     4        592 0.676
##  2     2 outlier     4        371 1.08 
##  3     3 outlier     6        330 1.82 
##  4     4 outlier     2        271 0.738
##  5     5 outlier     6        269 2.23 
##  6     6 outlier     5        249 2.01 
##  7     7 outlier     1        254 0.394
##  8     8 outlier     4        226 1.77 
##  9     9 outlier     9        206 4.37 
## 10    10 outlier     5        161 3.11 
## 11    11 outlier     3        158 1.90 
## 12    13 outlier     4        115 3.48 
## 13    14 outlier     3         70 4.29 
## 14    15 outlier     6         62 9.68 
## 15    17 outlier     5         52 9.62 
## 16    18 outlier     1         52 1.92 
## 17    19 outlier     5         52 9.62 
## 18    20 outlier     1         45 2.22 
## 19    21 outlier     3         37 8.11
# Genome-wide tally of outlier vs non-outlier 10-kb windows.
ungroup(TE_insertions_per_window) %>% dplyr::count(outlier) 
## # A tibble: 2 × 2
##   outlier         n
##   <chr>       <int>
## 1 non_outlier  3697
## 2 outlier        77

I now want to investigate the link between the TIP and the genome compartmentalization in terms of transcriptomics and epigenomics.

# Uploading commands from the cluster to my computer.
# (Run locally; pulls pre-computed per-window tables from the analysis server.)

#Per windows: coordinates, RIP, GC, SNP counts
# 10-kb window coordinates (BED).
rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
 ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed
 
# RIP indices per window.
rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv \
 ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv 
 
# GC content per window.
rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab \
 ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab
 
# SNP counts per window from the joint-called VCF.
rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.10kb_windows.SNP_counts.tab  \
 ~/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/4_Joint_calling/
 
# Gene and TE coverage per window.
rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.gene_coverage.tab \
~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/

rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.TE_coverage.tab \
 ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/
 
# ChIP-seq peak calls (whole directory).
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/7_ChipSeq_peaks ../1_Variant_calling/

# This is done for each histone mark.
# The data come from Klaas's paper on centromeres.
# The peaks were called by macs2 (option --nomodel) after trimming and mapping
# with trimmomatic and bowtie + filtering of reads with mapping quality > 30.

# Convert macs2 .xls peak tables to BED (chr, start, end, peak name),
# dropping header/comment lines.
grep -v "#" ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_Rep1_SRR2060839_peaks.xls  | cut -f 1,2,3,10 | \
   grep "peak" > ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_Rep1_SRR2060839_peaks.bed
grep -v "#" ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_Rep2_SRR2060840_peaks.xls  | cut -f 1,2,3,10 | \
   grep "peak" > ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_Rep2_SRR2060840_peaks.bed

# Keep only peaks supported by both replicates.
~/Documents/Software/bedtools2/bin/bedtools intersect \
   -a ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_Rep1_SRR2060839_peaks.bed \
   -b ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_Rep2_SRR2060840_peaks.bed \
   > ../1_Variant_calling/7_ChipSeq_peaks/H3K9me3_intersect_only_overlap.bed 



#Get length of overlap between genes and windows for RNAseq
~/Documents/Software/bedtools2/bin/bedtools intersect -wo \
  -a ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
  -b ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.gff3  \
  > ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.intersect_genes.txt
#Getting mappability per window
#Assign each bedgraph interval to the 10kb window containing its midpoint,
#then average mappability within each window, weighted by interval length.
mappability_raw = read_tsv(
  paste0(data_dir, "Zymoseptoria_tritici_genmap_mappability.bedgraph"),
  col_names = c("CHR", "start", "end", "mappab"),
  col_types = c("c", "d", "d", "d"))

mappability = mappability_raw %>%
  dplyr::mutate(
    Length = end - start,
    window = trunc(((start + end) / 2) / 10000)) %>%
  group_by(CHR, window) %>%
  dplyr::summarize(Mappability = weighted.mean(x = mappab, w = Length)) %>%
  mutate(Start = window * 10000) %>%
  ungroup() %>%
  dplyr::select(-window)


#Getting the methylation peaks
#Read the replicate-overlap peak BEDs for three histone marks, assign each
#peak to the 10kb window containing its midpoint, and compute the fraction
#of each window covered by peaks of each mark (one column per mark;
#windows without peaks of a mark get 0).
peak_cols = c("CHR", "start", "end", "peak")
peak_types = c("c", "d", "d", "c")
chipseq = bind_rows(
    read_tsv(paste0(chipseq_dir, "H3K27me3_intersect_only_overlap.bed"),
             col_names = peak_cols, col_types = peak_types),
    read_tsv(paste0(chipseq_dir, "H3K4me2_intersect_only_overlap.bed"),
             col_names = peak_cols, col_types = peak_types),
    read_tsv(paste0(chipseq_dir, "H3K9me2_intersect_only_overlap.bed"),
             col_names = peak_cols, col_types = peak_types)) %>%
  #The mark name is the first "_"-separated token of the peak name.
  separate(col = peak, into = c("Mark"), sep = "_", remove = FALSE, extra = "drop") %>%
  dplyr::mutate(
    Length = end - start,
    window = trunc(((start + end) / 2) / 10000)) %>%
  group_by(CHR, window, Mark) %>%
  dplyr::summarize(Coverage = sum(Length) / 10000) %>%
  mutate(Start = window * 10000) %>%
  ungroup() %>%
  dplyr::select(-window) %>%
  pivot_wider(names_from = Mark, values_from = Coverage, values_fill = 0)



#RNAseq
#Join the window/gene overlap table with in-planta expression (TPM) and
#compute, per gene, mean expression over the A/B replicates ("bio") and the
#C/D replicates ("necro").
#NOTE(review): replicate-to-phase mapping (A/B = biotrophic, C/D =
#necrotrophic) is assumed from the source spreadsheet — confirm.
TPM = read_tsv(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.intersect_genes.txt"), 
         col_names = c("CHR", "Start", "End", "X1", "X2", "X3", "Gene_start", "Gene_end",
                       "X4", "X5", "X6", "Annot", "Overlap")) %>%
  dplyr::select(CHR, Start, End, Annot, Overlap) %>%
  #Protein ID is the 6th "="/";"-separated token of the GFF attribute string.
  separate(col = Annot, into = c("X1", "X2", "X3", "X4","X5", "Protein_ID"), sep = "[=;]")  %>%
  inner_join(readxl::read_excel(paste0("/Users/afeurtey/Documents/Postdoc_Eva/Manuscripts/",
                                       "Accepted/Alice_Cecile_Comparative_genomics/",
                                       "Data_for_publication/Expression_data_Ztritici_inplanta.xlsx"), 
                                skip = 5, sheet = "IPO323_TPM")) %>%
  #One group per window/gene pair, so mean() averages the four replicates.
  group_by(CHR, Start, End, Protein_ID) %>%
  #BUG FIX: na.rm was previously outside the mean() call for Average_necro
  #(misplaced parenthesis), so NAs were not removed and a stray `na.rm`
  #column was silently created.
  mutate(Average_bio = mean(c(A_1, A_2, B_1, B_2), na.rm = T),
         Average_necro = mean(c(C_1, C_2, D_1, D_2), na.rm = T),
         CHR = as.character(CHR)) %>%
  dplyr::select(CHR, Start, End, Protein_ID, Overlap, Average_bio, Average_necro) 

#Average expression per 10kb window, weighting each gene by its bp overlap
#with the window.  Zt09_3_00018 is excluded — presumably an extreme
#expression outlier; confirm against the expression table.
TPM_per_window = TPM %>%
  filter(Protein_ID != "Zt09_3_00018") %>%
  group_by(CHR, Start, End) %>%
  dplyr::summarize(
    TPM_bio = weighted.mean(Average_bio, w = Overlap),
    TPM_necro = weighted.mean(Average_necro, w = Overlap))


#Getting all the data together
#Assemble one row per 10kb window by full-joining all per-window tables on
#their shared CHR/Start (and End where present) columns; full_join keeps
#windows that are missing from any single input (NAs filled later).
data = read_delim(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed"), 
                  col_names = c("CHR", "Start", "End")) %>%
#GC content and actual window length (last windows of a chromosome are short)
full_join(read_delim(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab"), skip = 1,
         col_names = c("CHR", "Start", "End", "ATpercent", "GCpercent", "NbA", "NbC", "NbG", "NbT", 
                       "NbN", "NbOther", "Window_length"))) %>%
  dplyr::select(CHR, Start, End, GCpercent, Window_length) %>%
#RIP indices per window; "nan" strings are read as NA
full_join(read_delim(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv"), 
         col_names = c("CHR", "Start", "End", "GC", "Product_index", "Substrate_index", "Composite_index"),
         na = c("", "nan"), delim = "\t"))  %>%
#Gene and TE coverage per window
full_join(read_tsv(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.gene_coverage.tab"), 
         col_names = c("CHR", "Start", "End", "Nb_overlapping_genes", "Coverage_bp_gene", "Window_length", "Coverage_frac_gene"))) %>%
full_join(read_tsv(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.TE_coverage.tab"), 
         col_names = c("CHR", "Start", "End", "Nb_overlapping_TEs", "Coverage_bp_TE", "Window_length", "Coverage_frac_TE"))) %>%
#TE insertion polymorphism (TIP) counts per window
full_join(read_delim(paste0(TE_RIP_dir, "Ztritici_global_March2021.good_samples.10kb_windows.TIP_counts.tab"),
                     col_types = c("c","d","c","d","d"))) %>%
#Short-variant counts: Start is a window index in this file, so rescale to bp
full_join(read_tsv(paste0(vcf_dir, 
                 "Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.10kb_windows.SNP_counts.tab"),
          col_names = c("CHR", "Start", "Short_variant_number"), col_types = c("c","d","d")) %>% 
            mutate(Start = Start*10000)) %>%
#Per-window mappability, histone-mark coverage and expression (built above)
  full_join(mappability) %>%
  full_join(chipseq) %>%
  full_join(TPM_per_window)

Low mappability can strongly hinder variant detection, and TIP detection in particular, so I remove windows with low mappability before running the PCA.

#Threshold setting, visualizing and data filtering
#Mappability value below which a 10kb window is considered unreliable.
threshold = 0.85

#Collapse CHR + Start into a single "CHR:Start" window identifier, drop
#truncated windows (< 7.5kb of sequence), and keep only the variables
#entering the PCA.
data_PCA = data %>%
  unite(CHR, Start, col = Window, sep = ":") %>%
  filter(Window_length > 7500) %>%
  #filter(TIP_count < 150) %>%
  dplyr::select(Window, GC, Composite_index, Coverage_frac_gene,
                Coverage_frac_TE, TIP_count, Short_variant_number,
                Mappability, H3K27me3, H3K9me2, H3K4me2,
                TPM_bio, TPM_necro)

#Windows missing a measure get 0 for it (e.g. no peak or TIP in the window).
data_PCA[is.na(data_PCA)] <- 0

#Density of per-window mappability, with the filtering threshold marked.
data_PCA %>%
  ggplot(aes(Mappability)) + 
  geom_density(fill = "#EDE7E3", col ="#ADA7A9") + 
  geom_vline(xintercept = threshold, col = "#82c0cc") + 
  theme_light()

#Keep only windows above the mappability threshold for the PCA.
data_PCA2 = data_PCA %>%
  filter(Mappability > threshold) 


#First, density plots of pre and post filtering
#Long-format table combining all windows and the mappable subset, with a
#per-measure median and a display category used for faceting the figures.
all_windows = data_PCA %>%
  pivot_longer(cols = -Window, names_to = "Measure", values_to = "Estimate") %>%
  mutate(Windows = "All")

mappable_windows = data_PCA2 %>%
  pivot_longer(cols = -Window, names_to = "Measure", values_to = "Estimate") %>%
  mutate(Windows = "Only mappable")

pre_post_filter = bind_rows(all_windows, mappable_windows) %>%
  filter(Measure != "Mappability") %>%
  group_by(Measure) %>%
  dplyr::mutate(Med = median(Estimate)) %>%
  #case_when is evaluated top to bottom: the H3*/TPM patterns must come
  #before the catch-all.
  mutate(Variable_category = case_when(
    str_detect(Measure, "H3") ~ "Epigenomics",
    Measure == "Coverage_frac_TE" ~ "TE related",
    str_detect(Measure, "TPM") ~ "Gene related",
    Measure == "Coverage_frac_gene" ~ "Gene related",
    Measure %in% c("GC", "Composite_index") ~ "TE related",
    TRUE ~ "Variant counts"))


#Violin and boxplot of the short variant and TIP counts, before vs after
#the mappability filter (Fig. S2).
pre_post_filter %>%
  filter(Variable_category == "Variant counts") %>%
  ggplot(aes(y = Estimate, x = Windows, 
             fill = Windows, col = Windows)) +
    geom_violin(alpha = .4) + 
    geom_boxplot(alpha = .4, width = .2, outlier.shape = NA) + 
    theme_light() +
    theme(panel.grid.major.x = element_blank()) +
    #BUG FIX: labs() was given `y` twice; the second argument was clearly
    #meant to blank the x-axis title.
    labs(y = "Variant count per 10kb window", x = "") +
  scale_color_manual(values = c("grey", "#2FC6B7"))+
  scale_fill_manual(values = c("grey", "#2FC6B7")) + 
  facet_wrap(vars(Measure), scales = "free", ncol = 3)  

ggsave(paste0(fig_dir, "FigS2_variants_filtered_on_mappability.pdf"), width = 12, height = 12, units = "cm")

Some variables are strongly associated with mappability. Here, I compare their values between the high- and low-mappability windows.

#Boxplots for a subset of the variables

#Windows classified by mappability relative to the threshold; windows
#without a mappability estimate are dropped.
temp = data %>% 
  filter(!is.na(Mappability)) %>%
  mutate(Mappability = ifelse(Mappability < threshold, "Low", "High"))

#Shared colour/fill scales for the four panels (alphabetical level order:
#High = grey, Low = orange).  Factored out so the four plots stay in sync.
mappability_scales = list(
  scale_color_manual(values = c("#ADA7A9", "#ffa62b")),
  scale_fill_manual(values = c("#EDE7E3", "#FFD399")))

#TE coverage
p1 = ggplot(temp, aes(x = Mappability, y = Coverage_frac_TE, col = Mappability, fill = Mappability)) +
     geom_boxplot() +
     theme_light() +
     mappability_scales

#H3K27me3 mark
p2 = ggplot(temp, aes(x = Mappability, y = H3K27me3, col = Mappability, fill = Mappability)) +
     geom_boxplot(alpha = .4) +
     theme_light() +
     mappability_scales

#Gene coverage
p3 = ggplot(temp, aes(x = Mappability, y = Coverage_frac_gene, col = Mappability, fill = Mappability)) +
     geom_boxplot() +
     theme_light() +
     mappability_scales

#Composite (RIP) index
#Previously this panel rebuilt `temp` from `data` with an identical
#pipeline; it now reuses `temp` directly.
p4 = ggplot(temp, aes(x = Mappability, y = Composite_index, col = Mappability, fill = Mappability)) +
     geom_boxplot() +
     theme_light() +
     mappability_scales



#Assemble the four mappability boxplots into one figure (Fig. S2): legends
#suppressed, x-axis titles only on the bottom row.
#NOTE(review): rel_heights has 4 values but plot_grid lays these out on 2
#rows by default — confirm the intended values are used.
 cowplot::plot_grid(p1 + theme(legend.position = "none", axis.title.x = element_blank()), 
                    p2 + theme(legend.position = "none", axis.title.x = element_blank()),
                    p3 + theme(legend.position = "none"), p4 + theme(legend.position = "none"),
                    rel_heights = c(1, 1, 1.25, 1.25))

ggsave(paste0(fig_dir, "FigS2_boxplots_high_low_mappability.pdf"), width = 12, height = 12, units = "cm")
#PCA on the mappable windows.
#Drop Mappability (used only for filtering), strip the Window ID column,
#and remove zero-variance columns (prcomp cannot scale those).
temp = data_PCA2 %>% 
  dplyr::select(-Mappability) 

#Build the numeric matrix once instead of twice (the original line
#recomputed as.matrix + apply inside its own subscript); drop = FALSE keeps
#a matrix even if a single column were to remain.
temp_mat = as.matrix(temp[, 2:ncol(temp), drop = FALSE])
temp = temp_mat[, apply(temp_mat, 2, var) != 0, drop = FALSE]

per_win.pca = prcomp(temp, center = TRUE, scale. = TRUE)

#Making custom PCA plot
#Variable loadings, used to draw the arrows on the biplot.
arrows = as_tibble(as.data.frame(per_win.pca$rotation)) %>%
  mutate(label = rownames(per_win.pca$rotation))

#Window scores alongside the original per-window variables.
#FIX: as.tibble() is deprecated since tibble 2.0 — use as_tibble().
temp = as_tibble(cbind(data_PCA2, as.data.frame(per_win.pca$x)))

#Core chromosomes are 1-13, accessory chromosomes 14+.
#NOTE(review): as.numeric(CHR) yields NA (hence CHR_type = NA) for any
#non-numeric chromosome name such as the mitochondrion — confirm intended.
for_plot = temp %>%
  separate(col = Window, into = c("CHR"), remove = F, extra = "drop", sep = ":") %>%
  mutate(CHR_type = ifelse(as.numeric(CHR) < 14, "Core", "Accessory"))

#Variance of each PC, used for the axis labels below.
eigs <- per_win.pca$sdev^2

#PCA biplot of the 10kb windows: points coloured by chromosome type,
#variable loadings drawn as arrows (scaled x10 for visibility).
#NOTE(review): axis limits are hard-coded and may clip extreme windows.
ggplot() + 
    geom_point(data = for_plot, aes(x = PC1, y = PC2, col = CHR_type), alpha = .6, shape = 1) + 
    theme_light() + 
    geom_segment(data = arrows, aes(x = 0, y = 0, xend = PC1*10, yend = PC2*10 ),
                 arrow = arrow(length = unit(0.03, "npc"))) +
    geom_text(data = arrows, aes(x = PC1*10, y = PC2*10, label = label)) + 
  coord_cartesian(xlim = c(-6, 10), ylim = c(-8, 7)) +
  scale_color_manual(values = c("#ffa62b", "#2FC6B7")) +
  labs(title = "PCA per 10kb windows", 
       subtitle = "Two first PCs",
       x = paste0("PC1 (", round(100*eigs[1]/sum(eigs), 1), "% explained var.)"),
       y = paste0("PC2 (", round(100*eigs[2]/sum(eigs), 1), "% explained var.)"))





Climatic check

#Unique sampling coordinates (one row per collection site).
#NOTE(review): the unlist() calls suggest Longitude/Latitude may be
#list-columns in Zt_meta (e.g. imported from Excel) — confirm.
temp = Zt_meta %>%
  #filter(!(ID_file %in% filtered_samples$ID_file)) %>%
  filter(!is.na(Longitude)) %>%
  mutate(X = as.numeric(unlist(Longitude)),
                            Y = as.numeric(unlist(Latitude))) %>%
  dplyr::select(X, Y) %>%
  distinct()

#Spatial points for raster extraction (lon/lat order; no CRS is set, the
#WorldClim rasters are in the same unprojected lon/lat system).
sp = SpatialPoints(temp[, c("X", "Y")])
summary(sp)
## Object of class SpatialPoints
## Coordinates:
##         min      max
## X -122.9300 175.6576
## Y  -46.2187  59.3294
## Is projected: NA 
## proj4string : [NA]
## Number of points: 307
#Extract the WorldClim bioclimatic variables (10 arcmin rasters) at every
#sampling coordinate.  One named list entry per variable.
#NOTE(review): assumes Bioclim_var is ordered BIO1..BIO19 to match the
#raster file numbering — confirm.
bio_list = list()
#FIX: seq_along() instead of c(1:length(...)) — safe if the vector is empty.
for (i in seq_along(Bioclim_var)) {
  file_name = paste0(data_dir, "Climatic_data/wc2.1_10m_bio/wc2.1_10m_bio_", i, ".tif")
  rast_temp = raster(file_name)
  bio_list[[Bioclim_var[i]]] = raster::extract(rast_temp, sp)
}

#One row per unique coordinate, one column per bioclim variable.
climate_per_coordinates = cbind(temp, do.call(cbind, bio_list))

#Attach the climate values to the per-isolate TE counts via the sampling
#coordinates, then drop isolates with missing metadata or TIP counts.
dat = TE_qty %>%
  full_join(climate_per_coordinates,
            by = c("Longitude" = "X", "Latitude" = "Y")) %>%
  filter(!is.na(Continent),
         !is.na(Collection),
         !is.na(Cluster),
         !is.na(Total_insertions))
names(dat)
##  [1] "ID_file"                                             
##  [2] "Non_ref_insertions"                                  
##  [3] "Ref_insertions"                                      
##  [4] "Total_insertions"                                    
##  [5] "Continent"                                           
##  [6] "Country"                                             
##  [7] "Latitude"                                            
##  [8] "Longitude"                                           
##  [9] "Latitude2"                                           
## [10] "Longitude2"                                          
## [11] "Sampling_Date"                                       
## [12] "Collection"                                          
## [13] "Cluster"                                             
## [14] "Annual Mean Temperature"                             
## [15] "Mean Diurnal Range "                                 
## [16] "Isothermality (BIO2/BIO7) (×100)"                    
## [17] "Temperature Seasonality (standard deviation ×100)"   
## [18] "Max Temperature of Warmest Month"                    
## [19] "Min Temperature of Coldest Month"                    
## [20] "Temperature Annual Range (BIO5-BIO6)"                
## [21] "Mean Temperature of Wettest Quarter"                 
## [22] "Mean Temperature of Driest Quarter"                  
## [23] "Mean Temperature of Warmest Quarter"                 
## [24] "Mean Temperature of Coldest Quarter"                 
## [25] "Annual Precipitation"                                
## [26] "Precipitation of Wettest Month"                      
## [27] "Precipitation of Driest Month"                       
## [28] "Precipitation Seasonality (Coefficient of Variation)"
## [29] "Precipitation of Wettest Quarter"                    
## [30] "Precipitation of Driest Quarter"                     
## [31] "Precipitation of Warmest Quarter"                    
## [32] "Precipitation of Coldest Quarter"
#Total TE insertions vs absolute latitude (tropics, |lat| <= 20, excluded),
#pooled over clusters, with one linear fit and an R2/p annotation.
#NOTE(review): ..rr.label../..p.label.. are the deprecated dot-dot aesthetic
#names; recent ggplot2 prefers after_stat().
dat %>%
  filter(abs(Latitude) > 20) %>%
  ggplot(aes(abs(Latitude), Total_insertions)) +
    theme_light()  +
    geom_point(aes(color = Cluster, shape = Cluster)) + 
    Color_Cluster + Shape_Cluster  + 
    geom_smooth(color = "grey20", method = "lm", se =F) +
   stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 650) +
   stat_regline_equation(label.y = 610)

#Same relationship fitted separately within each population cluster
#(one facet and one regression line per cluster).
dat %>%
  filter(abs(Latitude) > 20) %>%
  ggplot(aes(abs(Latitude), Total_insertions)) +
    theme_light()  +
    geom_point(aes(color = Cluster, shape = Cluster)) + 
    geom_smooth(aes(color = Cluster), method = "lm", se =F) +
    Color_Cluster + Shape_Cluster  +
    facet_wrap(vars(Cluster))+
   stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 160) +
   stat_regline_equation(label.y = 100)

#Linear model of total TE insertions, controlling for sequencing collection
#(batch effect) and population cluster, to test the latitude effect.
model_climate = lm(Total_insertions ~  Collection + Cluster + Latitude,
          data=dat)
summary(model_climate)
## 
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Latitude, 
##     data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -143.333  -21.702   -1.209   18.948  241.381 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      250.8555     8.8272  28.418  < 2e-16 ***
## CollectionETHZ_2020              102.3821     6.5211  15.700  < 2e-16 ***
## CollectionHartmann_FstQst_2015     1.1524     6.3930   0.180 0.856990    
## CollectionHartmann_Oregon_2016   180.9941     7.8124  23.168  < 2e-16 ***
## CollectionINRAE_BIOGER            67.4046     5.1370  13.121  < 2e-16 ***
## CollectionJGI                     48.1781     7.9148   6.087 1.61e-09 ***
## CollectionJGI_2                   39.5121     7.9700   4.958 8.32e-07 ***
## CollectionJGI_3                   54.7530     9.5907   5.709 1.48e-08 ***
## CollectionJGI_4                   82.5057    21.0920   3.912 9.75e-05 ***
## CollectionJGI_5                   70.6836     7.4022   9.549  < 2e-16 ***
## CollectionMMcDonald_2018         171.3117    17.0885  10.025  < 2e-16 ***
## CollectionSyngenta               211.1625     4.3000  49.108  < 2e-16 ***
## CollectionUnine_third_batch_2018 -21.7386    10.0172  -2.170 0.030221 *  
## ClusterV1                        -31.7350     9.6639  -3.284 0.001058 ** 
## ClusterV10                        80.3299     6.5783  12.211  < 2e-16 ***
## ClusterV11                       -62.8027     7.6789  -8.179 8.24e-16 ***
## ClusterV2                         28.9759     5.6946   5.088 4.28e-07 ***
## ClusterV3                          1.9224    10.1270   0.190 0.849480    
## ClusterV4                         96.5048    15.4650   6.240 6.33e-10 ***
## ClusterV5                         38.5572    14.0331   2.748 0.006107 ** 
## ClusterV6                        -48.9305     7.7032  -6.352 3.16e-10 ***
## ClusterV7                        -39.0489    10.3165  -3.785 0.000162 ***
## ClusterV8                         66.7578     9.8909   6.749 2.45e-11 ***
## ClusterV9                         56.9904    14.8753   3.831 0.000135 ***
## Latitude                           0.4609     0.1748   2.636 0.008505 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.83 on 1048 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8707, Adjusted R-squared:  0.8678 
## F-statistic: 294.2 on 24 and 1048 DF,  p-value: < 2.2e-16
#Long format: one row per isolate x environmental variable.
#NOTE(review): Latitude/Longitude (and Latitude2/Longitude2) are not in the
#exclusion list, so they are also pivoted as "Variables"; harmless here
#since only Bioclim_var entries are used downstream — confirm intended.
dat2 = dat %>% 
  pivot_longer(cols = -c(ID_file, Non_ref_insertions, Ref_insertions, Total_insertions,
   Continent, Country, Sampling_Date, Collection, Cluster), 
   names_to = "Variable", values_to = "Estimate_variable")


#Shortlist of bioclim variables of interest (defined here; not used in the
#loop below, which iterates over the full Bioclim_var set).
short_list_bioclim = c("Annual Mean Temperature", "Mean Diurnal Range ", "Temperature Seasonality (standard deviation ×100)" ,
                       "Annual Precipitation", "Precipitation Seasonality (Coefficient of Variation)")

#For each bioclimatic variable, refit the TE-insertion model with that
#variable in place of latitude; print the model summary and two diagnostic
#plots (pooled fit, then per-cluster fits) only when the variable's
#coefficient is significant at 0.05.
#NOTE(review): p-values are not corrected for testing ~19 variables.
for (var_of_interest in Bioclim_var){
#Same covariates as model_climate above, with the climate variable as the
#focal predictor.
model_climate = lm(Total_insertions ~  Collection + Cluster + Estimate_variable,
          data=filter(dat2, Variable == var_of_interest))
temp = summary(model_climate)

#Row 'Estimate_variable', column 4 of the coefficient table is the p-value.
if (temp$coefficients['Estimate_variable',4] <= 0.05){
  print(var_of_interest)
  print(temp)
  
  
#Pooled fit across clusters.
print(dat2 %>% 
  filter(Variable == var_of_interest) %>%
  ggplot(aes(Estimate_variable, Total_insertions)) +
    theme_light()  +
    geom_point(aes(color = Cluster, shape = Cluster)) + 
    Color_Cluster + Shape_Cluster  + 
    geom_smooth(color = "grey20", method = "lm", se =F) +
   stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 150) +
   stat_regline_equation(label.y = 110) +
   labs( x = str_wrap(var_of_interest, 40)))

    
#Per-cluster fits, one facet per cluster.
print(dat2 %>% 
  filter(Variable == var_of_interest) %>%
  ggplot(aes(Estimate_variable, Total_insertions)) +
    theme_light()  +
    geom_point(aes(color = Cluster, shape = Cluster)) + 
    geom_smooth(aes(color = Cluster), method = "lm", se =F) +
    Color_Cluster + Shape_Cluster  +
    facet_wrap(vars(Cluster))+
   stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 160) +
   stat_regline_equation(label.y = 100) +
   labs( x = str_wrap(var_of_interest, 40)))
}
}
## [1] "Annual Mean Temperature"
## 
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable, 
##     data = filter(dat2, Variable == var_of_interest))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -143.892  -20.997   -1.225   19.077  242.636 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      309.7700     9.9228  31.218  < 2e-16 ***
## CollectionETHZ_2020              108.8584     6.5810  16.541  < 2e-16 ***
## CollectionHartmann_FstQst_2015     1.4096     6.3204   0.223 0.823566    
## CollectionHartmann_Oregon_2016   180.9727     7.7207  23.440  < 2e-16 ***
## CollectionINRAE_BIOGER            75.5250     5.2593  14.360  < 2e-16 ***
## CollectionJGI                     51.6194     7.8460   6.579 7.46e-11 ***
## CollectionJGI_2                   45.6106     7.9592   5.731 1.31e-08 ***
## CollectionJGI_3                   60.9234     9.5308   6.392 2.46e-10 ***
## CollectionJGI_4                   92.4521    20.8971   4.424 1.07e-05 ***
## CollectionJGI_5                   79.0263     7.4937  10.546  < 2e-16 ***
## CollectionMMcDonald_2018         130.7912    10.0008  13.078  < 2e-16 ***
## CollectionSyngenta               216.0019     4.2720  50.562  < 2e-16 ***
## CollectionUnine_third_batch_2018 -21.9028     9.8748  -2.218 0.026765 *  
## ClusterV1                        -25.3970     9.5933  -2.647 0.008234 ** 
## ClusterV10                        83.9475     6.3368  13.248  < 2e-16 ***
## ClusterV11                       -49.3068     7.7771  -6.340 3.41e-10 ***
## ClusterV2                         24.2764     5.5203   4.398 1.21e-05 ***
## ClusterV3                          2.6154    10.0136   0.261 0.794003    
## ClusterV4                         79.1163     9.6269   8.218 6.05e-16 ***
## ClusterV5                         15.0175     7.8373   1.916 0.055617 .  
## ClusterV6                        -22.3153     9.0856  -2.456 0.014206 *  
## ClusterV7                        -20.0044    10.7381  -1.863 0.062752 .  
## ClusterV8                         40.7932    11.1182   3.669 0.000256 ***
## ClusterV9                         32.0229    10.0119   3.198 0.001423 ** 
## Estimate_variable                 -3.6879     0.6603  -5.585 2.98e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.43 on 1048 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8737, Adjusted R-squared:  0.8708 
## F-statistic: 301.9 on 24 and 1048 DF,  p-value: < 2.2e-16

## [1] "Mean Diurnal Range "
## 
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable, 
##     data = filter(dat2, Variable == var_of_interest))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -143.195  -21.939   -0.956   19.106  237.323 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       296.052     12.768  23.188  < 2e-16 ***
## CollectionETHZ_2020               103.876      6.614  15.706  < 2e-16 ***
## CollectionHartmann_FstQst_2015      3.145      6.429   0.489  0.62480    
## CollectionHartmann_Oregon_2016    183.125      7.827  23.397  < 2e-16 ***
## CollectionINRAE_BIOGER             67.695      5.135  13.184  < 2e-16 ***
## CollectionJGI                      48.938      7.917   6.181 9.08e-10 ***
## CollectionJGI_2                    36.582      7.865   4.651 3.72e-06 ***
## CollectionJGI_3                    54.278      9.597   5.656 2.00e-08 ***
## CollectionJGI_4                    84.512     21.083   4.008 6.54e-05 ***
## CollectionJGI_5                    70.881      7.406   9.571  < 2e-16 ***
## CollectionMMcDonald_2018          133.414     10.104  13.204  < 2e-16 ***
## CollectionSyngenta                212.296      4.273  49.684  < 2e-16 ***
## CollectionUnine_third_batch_2018  -21.209     10.002  -2.121  0.03419 *  
## ClusterV1                         -27.558      9.728  -2.833  0.00470 ** 
## ClusterV10                         87.593      6.535  13.404  < 2e-16 ***
## ClusterV11                        -59.811      7.616  -7.853 1.00e-14 ***
## ClusterV2                          29.146      5.672   5.139 3.30e-07 ***
## ClusterV3                           4.468     10.181   0.439  0.66087    
## ClusterV4                          70.290      9.645   7.288 6.19e-13 ***
## ClusterV5                           6.967      7.827   0.890  0.37360    
## ClusterV6                         -49.810      7.692  -6.476 1.45e-10 ***
## ClusterV7                         -33.822     10.486  -3.226  0.00130 ** 
## ClusterV8                          78.113      9.618   8.122 1.28e-15 ***
## ClusterV9                          35.744     10.502   3.404  0.00069 ***
## Estimate_variable                  -2.877      1.094  -2.630  0.00867 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.83 on 1048 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8707, Adjusted R-squared:  0.8678 
## F-statistic: 294.2 on 24 and 1048 DF,  p-value: < 2.2e-16

## [1] "Isothermality (BIO2/BIO7) (×100)"
## 
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable, 
##     data = filter(dat2, Variable == var_of_interest))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -142.654  -21.634   -1.384   19.685  242.398 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      284.1170     9.8762  28.768  < 2e-16 ***
## CollectionETHZ_2020              101.7464     6.5097  15.630  < 2e-16 ***
## CollectionHartmann_FstQst_2015     4.0133     6.5037   0.617 0.537318    
## CollectionHartmann_Oregon_2016   185.5921     7.9997  23.200  < 2e-16 ***
## CollectionINRAE_BIOGER            69.8230     5.2117  13.397  < 2e-16 ***
## CollectionJGI                     49.4529     7.9342   6.233 6.63e-10 ***
## CollectionJGI_2                   38.9554     7.9703   4.888 1.18e-06 ***
## CollectionJGI_3                   55.2100     9.5960   5.753 1.15e-08 ***
## CollectionJGI_4                   85.9543    21.1148   4.071 5.04e-05 ***
## CollectionJGI_5                   72.4344     7.4872   9.674  < 2e-16 ***
## CollectionMMcDonald_2018         140.8752    10.4263  13.512  < 2e-16 ***
## CollectionSyngenta               214.1764     4.3435  49.310  < 2e-16 ***
## CollectionUnine_third_batch_2018 -20.3452     9.9945  -2.036 0.042038 *  
## ClusterV1                        -30.2209     9.6659  -3.127 0.001817 ** 
## ClusterV10                        83.5069     6.4224  13.002  < 2e-16 ***
## ClusterV11                       -60.6150     7.6235  -7.951 4.76e-15 ***
## ClusterV2                         32.2845     5.3677   6.015 2.49e-09 ***
## ClusterV3                          1.8548    10.1354   0.183 0.854831    
## ClusterV4                         65.0799     9.3647   6.950 6.43e-12 ***
## ClusterV5                          9.6724     7.8675   1.229 0.219191    
## ClusterV6                        -48.3635     7.7344  -6.253 5.85e-10 ***
## ClusterV7                        -41.6691    10.4033  -4.005 6.63e-05 ***
## ClusterV8                         67.7641     9.8979   6.846 1.29e-11 ***
## ClusterV9                         35.3606    10.5887   3.339 0.000869 ***
## Estimate_variable                 -0.4630     0.2035  -2.276 0.023071 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.86 on 1048 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8705, Adjusted R-squared:  0.8676 
## F-statistic: 293.6 on 24 and 1048 DF,  p-value: < 2.2e-16

## [1] "Max Temperature of Warmest Month"
## 
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable, 
##     data = filter(dat2, Variable == var_of_interest))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -143.961  -21.516   -1.078   18.673  235.270 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      326.4286    16.1775  20.178  < 2e-16 ***
## CollectionETHZ_2020              106.4341     6.6256  16.064  < 2e-16 ***
## CollectionHartmann_FstQst_2015     0.2957     6.3709   0.046   0.9630    
## CollectionHartmann_Oregon_2016   178.8620     7.8069  22.911  < 2e-16 ***
## CollectionINRAE_BIOGER            69.9151     5.1387  13.606  < 2e-16 ***
## CollectionJGI                     49.7920     7.8883   6.312 4.06e-10 ***
## CollectionJGI_2                   37.2500     7.8350   4.754 2.27e-06 ***
## CollectionJGI_3                   58.9397     9.5879   6.147 1.12e-09 ***
## CollectionJGI_4                   87.4284    21.0088   4.162 3.42e-05 ***
## CollectionJGI_5                   72.3892     7.3913   9.794  < 2e-16 ***
## CollectionMMcDonald_2018         120.9225    10.6396  11.365  < 2e-16 ***
## CollectionSyngenta               212.1185     4.2551  49.850  < 2e-16 ***
## CollectionUnine_third_batch_2018 -20.6013     9.9395  -2.073   0.0384 *  
## ClusterV1                        -24.4337     9.7408  -2.508   0.0123 *  
## ClusterV10                        88.2744     6.4612  13.662  < 2e-16 ***
## ClusterV11                       -53.2513     7.7789  -6.846 1.29e-11 ***
## ClusterV2                         25.7476     5.6958   4.520 6.87e-06 ***
## ClusterV3                          4.0718    10.1010   0.403   0.6870    
## ClusterV4                         74.5656     9.6724   7.709 2.93e-14 ***
## ClusterV5                          6.8292     7.7903   0.877   0.3809    
## ClusterV6                        -40.4657     8.0277  -5.041 5.46e-07 ***
## ClusterV7                        -18.2214    11.4862  -1.586   0.1130    
## ClusterV8                         69.7670     9.5148   7.332 4.51e-13 ***
## ClusterV9                         25.8505    10.0770   2.565   0.0104 *  
## Estimate_variable                 -2.2004     0.5506  -3.996 6.88e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.68 on 1048 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8718, Adjusted R-squared:  0.8689 
## F-statistic: 297.1 on 24 and 1048 DF,  p-value: < 2.2e-16

## [1] "Min Temperature of Coldest Month"
## 
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable, 
##     data = filter(dat2, Variable == var_of_interest))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -142.583  -21.982   -1.591   20.106  244.394 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      264.5865     6.4071  41.296  < 2e-16 ***
## CollectionETHZ_2020              101.7783     6.4808  15.705  < 2e-16 ***
## CollectionHartmann_FstQst_2015     4.0480     6.4391   0.629  0.52971    
## CollectionHartmann_Oregon_2016   186.5699     7.9457  23.480  < 2e-16 ***
## CollectionINRAE_BIOGER            73.5067     5.4278  13.543  < 2e-16 ***
## CollectionJGI                     49.0130     7.9049   6.200 8.09e-10 ***
## CollectionJGI_2                   41.3326     8.0235   5.151 3.09e-07 ***
## CollectionJGI_3                   57.6884     9.6005   6.009 2.57e-09 ***
## CollectionJGI_4                   89.9914    21.1309   4.259 2.24e-05 ***
## CollectionJGI_5                   74.7044     7.5399   9.908  < 2e-16 ***
## CollectionMMcDonald_2018         139.2740    10.1638  13.703  < 2e-16 ***
## CollectionSyngenta               215.9113     4.4038  49.029  < 2e-16 ***
## CollectionUnine_third_batch_2018 -21.0476     9.9762  -2.110  0.03511 *  
## ClusterV1                        -29.1681     9.6531  -3.022  0.00258 ** 
## ClusterV10                        82.7477     6.4167  12.896  < 2e-16 ***
## ClusterV11                       -54.5472     7.8086  -6.986 5.03e-12 ***
## ClusterV2                         33.3031     5.2612   6.330 3.63e-10 ***
## ClusterV3                          0.7259    10.1164   0.072  0.94281    
## ClusterV4                         67.3669     9.3906   7.174 1.38e-12 ***
## ClusterV5                         14.5665     8.0902   1.801  0.07207 .  
## ClusterV6                        -39.2917     8.3981  -4.679 3.27e-06 ***
## ClusterV7                        -37.0792    10.3142  -3.595  0.00034 ***
## ClusterV8                         48.6452    12.4076   3.921 9.41e-05 ***
## ClusterV9                         33.1502    10.2071   3.248  0.00120 ** 
## Estimate_variable                 -1.3618     0.4287  -3.176  0.00154 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.78 on 1048 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8711, Adjusted R-squared:  0.8682 
## F-statistic: 295.2 on 24 and 1048 DF,  p-value: < 2.2e-16

## [1] "Mean Temperature of Driest Quarter"
## 
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable, 
##     data = filter(dat2, Variable == var_of_interest))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -144.556  -21.372   -1.701   20.106  245.387 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      274.7080     6.6978  41.015  < 2e-16 ***
## CollectionETHZ_2020              102.5593     6.4834  15.819  < 2e-16 ***
## CollectionHartmann_FstQst_2015     5.7385     6.4849   0.885 0.376419    
## CollectionHartmann_Oregon_2016   189.0828     8.0420  23.512  < 2e-16 ***
## CollectionINRAE_BIOGER            74.9882     5.4789  13.687  < 2e-16 ***
## CollectionJGI                     48.3689     7.8904   6.130 1.24e-09 ***
## CollectionJGI_2                   40.0543     7.9141   5.061 4.92e-07 ***
## CollectionJGI_3                   62.5016     9.7539   6.408 2.23e-10 ***
## CollectionJGI_4                   92.7866    21.1503   4.387 1.27e-05 ***
## CollectionJGI_5                   74.6322     7.4874   9.968  < 2e-16 ***
## CollectionMMcDonald_2018         137.2590    10.0764  13.622  < 2e-16 ***
## CollectionSyngenta               217.3042     4.4623  48.698  < 2e-16 ***
## CollectionUnine_third_batch_2018 -19.5326     9.9470  -1.964 0.049833 *  
## ClusterV1                        -28.1976     9.6503  -2.922 0.003553 ** 
## ClusterV10                        83.1448     6.3966  12.998  < 2e-16 ***
## ClusterV11                       -52.9067     7.8489  -6.741 2.60e-11 ***
## ClusterV2                         28.4470     5.5132   5.160 2.96e-07 ***
## ClusterV3                          0.4401    10.1020   0.044 0.965257    
## ClusterV4                         69.5691     9.4397   7.370 3.46e-13 ***
## ClusterV5                          5.5546     7.8207   0.710 0.477710    
## ClusterV6                        -41.6432     8.0078  -5.200 2.39e-07 ***
## ClusterV7                        -27.3351    10.7490  -2.543 0.011132 *  
## ClusterV8                         56.6657    10.5947   5.348 1.09e-07 ***
## ClusterV9                         33.6045    10.1797   3.301 0.000995 ***
## Estimate_variable                 -0.7754     0.2121  -3.657 0.000268 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.72 on 1048 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8715, Adjusted R-squared:  0.8686 
## F-statistic: 296.2 on 24 and 1048 DF,  p-value: < 2.2e-16

## [1] "Mean Temperature of Warmest Quarter"
## 
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable, 
##     data = filter(dat2, Variable == var_of_interest))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -143.481  -21.593   -1.068   18.377  237.588 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      328.1999    14.7150  22.304  < 2e-16 ***
## CollectionETHZ_2020              107.5958     6.6214  16.250  < 2e-16 ***
## CollectionHartmann_FstQst_2015    -2.6261     6.4081  -0.410  0.68203    
## CollectionHartmann_Oregon_2016   174.2193     7.9232  21.988  < 2e-16 ***
## CollectionINRAE_BIOGER            69.5798     5.1137  13.606  < 2e-16 ***
## CollectionJGI                     50.7252     7.8772   6.439 1.82e-10 ***
## CollectionJGI_2                   38.8089     7.8326   4.955 8.44e-07 ***
## CollectionJGI_3                   60.0291     9.5761   6.269 5.31e-10 ***
## CollectionJGI_4                   87.2215    20.9515   4.163 3.40e-05 ***
## CollectionJGI_5                   73.4438     7.3869   9.942  < 2e-16 ***
## CollectionMMcDonald_2018         118.5501    10.6319  11.150  < 2e-16 ***
## CollectionSyngenta               211.8242     4.2459  49.889  < 2e-16 ***
## CollectionUnine_third_batch_2018 -19.6465     9.9104  -1.982  0.04769 *  
## ClusterV1                        -25.7297     9.6508  -2.666  0.00779 ** 
## ClusterV10                        86.1769     6.3798  13.508  < 2e-16 ***
## ClusterV11                       -51.8024     7.7796  -6.659 4.45e-11 ***
## ClusterV2                         25.2824     5.6069   4.509 7.24e-06 ***
## ClusterV3                          3.4890    10.0661   0.347  0.72896    
## ClusterV4                         77.2179     9.7139   7.949 4.83e-15 ***
## ClusterV5                          6.8008     7.7702   0.875  0.38165    
## ClusterV6                        -32.1168     8.5763  -3.745  0.00019 ***
## ClusterV7                        -14.3287    11.5361  -1.242  0.21449    
## ClusterV8                         65.4204     9.6151   6.804 1.71e-11 ***
## ClusterV9                         21.1809    10.1499   2.087  0.03715 *  
## Estimate_variable                 -3.0424     0.6599  -4.610 4.52e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.59 on 1048 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8725, Adjusted R-squared:  0.8696 
## F-statistic: 298.8 on 24 and 1048 DF,  p-value: < 2.2e-16

## [1] "Mean Temperature of Coldest Quarter"
## 
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Estimate_variable, 
##     data = filter(dat2, Variable == var_of_interest))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -143.14  -21.13   -1.37   20.01  244.63 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      271.9332     6.4656  42.058  < 2e-16 ***
## CollectionETHZ_2020              104.0141     6.5117  15.973  < 2e-16 ***
## CollectionHartmann_FstQst_2015     4.5120     6.4136   0.704 0.481900    
## CollectionHartmann_Oregon_2016   186.6368     7.8712  23.711  < 2e-16 ***
## CollectionINRAE_BIOGER            74.7352     5.3933  13.857  < 2e-16 ***
## CollectionJGI                     50.4771     7.8971   6.392 2.46e-10 ***
## CollectionJGI_2                   43.8724     8.0667   5.439 6.68e-08 ***
## CollectionJGI_3                   58.6113     9.5799   6.118 1.34e-09 ***
## CollectionJGI_4                   91.6710    21.0750   4.350 1.50e-05 ***
## CollectionJGI_5                   76.7822     7.5629  10.152  < 2e-16 ***
## CollectionMMcDonald_2018         140.2431    10.1302  13.844  < 2e-16 ***
## CollectionSyngenta               216.6828     4.3836  49.430  < 2e-16 ***
## CollectionUnine_third_batch_2018 -21.2005     9.9446  -2.132 0.033251 *  
## ClusterV1                        -28.1481     9.6348  -2.921 0.003558 ** 
## ClusterV10                        82.7662     6.3920  12.948  < 2e-16 ***
## ClusterV11                       -53.4135     7.7683  -6.876 1.06e-11 ***
## ClusterV2                         30.7678     5.3213   5.782 9.73e-09 ***
## ClusterV3                          1.3566    10.0835   0.135 0.893007    
## ClusterV4                         70.4716     9.4431   7.463 1.78e-13 ***
## ClusterV5                         15.5652     8.0208   1.941 0.052574 .  
## ClusterV6                        -35.3874     8.4900  -4.168 3.32e-05 ***
## ClusterV7                        -35.4433    10.3048  -3.439 0.000606 ***
## ClusterV8                         45.5873    11.8243   3.855 0.000123 ***
## ClusterV9                         35.8864    10.2399   3.505 0.000477 ***
## Estimate_variable                 -1.7137     0.4274  -4.010 6.50e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.68 on 1048 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8719, Adjusted R-squared:  0.8689 
## F-statistic: 297.1 on 24 and 1048 DF,  p-value: < 2.2e-16

Repeat-Induced Point mutation

RIP per genetic cluster

RIP composite median over TEs

I now look at the repeat-induced point mutations in reads that map onto the different TE consensus sequences. We expect to see differences among the different geographical groups, so I start by visualizing this.

#while read p; do fichier_list=$(ls -1 /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/${p}\.*txt) ; for fichier in $fichier_list; do short_name=$(echo $fichier | cut -f 8 -d "/" | cut -f 2 -d "." ) ; comp=$(grep Composite $fichier) ; echo $p $short_name $comp ;  done; done < Keep_lists_samples/Ztritici_global_March2021.genotyped.good_samples.args > /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt 
# Per-sample read counts on each TE consensus (presumably samtools
# idxstats-style output — one row per (sample, TE) with mapped/unmapped
# read-segment counts; TODO confirm the upstream tool).
reads_per_TE = read_delim(paste0(RIP_DIR, "Nb_reads_per_TE.txt"), delim = "\t",
                    col_names = c("ID_file", "TE", "Length",
                                  "# mapped read-segments",  "# unmapped read-segments")) %>%
  # "*" is the catch-all row for unmapped reads — drop it
  filter(TE != "*") %>%
  # TE names look like <Superfamily>_<id>; keep the full name as well
  separate(TE, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
  # NOTE(review): in the Wicker three-letter code, names starting with "D"
  # denote Class II (DNA transposons), yet this ifelse labels "^D" matches as
  # Class I — verify the direction against the TE naming scheme used here.
  dplyr::mutate(Order = ifelse(!grepl('^D',TE), "Class II (DNA transposons)", "Class I (retrotransposons)")) %>%
  # Attach sample metadata (joins on ID_file)
  left_join(Zt_meta %>% dplyr::select(ID_file, Collection, Country, Continent)) %>%
  # Human-readable label used for interactive/hover display
  unite(Continent, Country, ID_file, col = "for_display", remove = F)

# Total TE-mapped read-segments per sample, used to normalize per-TE counts
temp = reads_per_TE %>% group_by(ID_file) %>%
       dplyr::summarise(Reads_mapped_per_TE = sum(`# mapped read-segments`))

# Fraction of a sample's TE-mapped reads that land on each consensus
reads_per_TE = left_join(reads_per_TE, temp) %>%
  dplyr::mutate(Normalized_nb_reads_mapped = `# mapped read-segments` / Reads_mapped_per_TE)


# Per-sample, per-TE RIP composite index (median and mean), assembled by the
# shell one-liner above into Composite_index.txt. "nan" strings become NA.
RIP=read_delim(paste0(RIP_DIR, "Composite_index.txt"),
             col_names = c("ID_file", "TE",  "Variable", "Composite_median", "Composite_mean" ), 
             delim = " ", na = c("nan", "NA", "")) %>%
  separate(TE, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
  # NOTE(review): same "^D" => "Class I" labelling as in reads_per_TE —
  # Wicker codes use D for Class II DNA transposons; confirm the direction.
  mutate(Order = ifelse(!grepl('^D',TE), "Class II (DNA transposons)", "Class I (retrotransposons)"))%>%
  left_join(Zt_meta %>% dplyr::select(Continent, Cluster, Country, Collection, ID_file)) %>%
  unite(Continent, Country, ID_file, col = "for_display", remove = F) %>%
  # Bring in read counts so points with too few mapped reads can be filtered
  left_join(., reads_per_TE)
#DONE: merge with reads_per_TE to be able to filter the points that have too few reads mapped!


#Per cluster
# World-wide baseline for the lollipop plot: the grand mean of the per-cluster
# mean RIP composite indices (so clusters with many samples do not dominate).
cluster_rip_means <- RIP %>%
  filter(TE == "RIP_est") %>%
  group_by(Cluster) %>%
  dplyr::summarize(avg = mean(as.numeric(Composite_median), na.rm = TRUE)) %>%
  ungroup()
world_RIP_avg <- mean(cluster_rip_means$avg, na.rm = TRUE)

# Manual ordering of the genetic clusters to match the tree figure elsewhere
ordering_table = tibble(Cluster = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"),
                        Order_tree = c(6,9,8,7,5,2,1,10,4,11,3))
# Per-sample genome-wide RIP estimates with the cluster average attached
temp = RIP %>%
  filter(TE == "RIP_est") %>%
  filter(!is.na(Cluster)) %>%
  group_by(Cluster) %>%
  # mutate (not summarise): region_avg is repeated on every row of the cluster
  dplyr::mutate(region_avg = mean(as.numeric(Composite_median), na.rm = T)) %>%
  left_join(., ordering_table) %>%
  arrange(Order_tree) 

# Lollipop-style plot: jittered per-sample points per cluster, a segment from
# the world average to each cluster average, and the cluster mean as a big dot.
RIP_plot = ggplot(temp, aes(x = reorder(Cluster, -Order_tree),
                            y = as.numeric(Composite_median),
                            color = Cluster))   +
  coord_flip() +
  # NOTE(review): the segments reorder clusters by -region_avg while the main
  # aes reorders by -Order_tree — confirm the two orderings agree, otherwise
  # the segments may not line up with their cluster rows.
  geom_segment(aes(x = reorder(Cluster, -region_avg), xend = reorder(Cluster, -region_avg),
        y = world_RIP_avg, yend = region_avg), size = 0.8) +
  geom_jitter(size = 1.5, alpha = 0.2, width = 0.2) +
  # Baseline: grand mean of the cluster averages (computed above)
  geom_hline(aes(yintercept = world_RIP_avg), color = "gray70", size = 0.6) +
  stat_summary(fun = mean, geom = "point", size = 5) +
  Color_Cluster +
  theme_light() +
    theme(legend.position = "none",
          axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = "RIP composite index",
        title = "RIP levels per Cluster",
        subtitle = str_wrap(paste("The RIP levels in reads mapping on TE consensus",
                                  "are high in the Middle-East",
                                  "and low in North America in particular."), width = 70))


# Statistical test: one-way ANOVA with Collection as a blocking factor.
# Linear model of the median RIP composite index on genetic cluster,
# controlling for the sampling collection (batch effect).
model <- lm(as.numeric(Composite_median) ~ Cluster + Collection,
            data = temp)
Anova(model, type = "II")
## Anova Table (Type II tests)
## 
## Response: as.numeric(Composite_median)
##             Sum Sq  Df F value    Pr(>F)    
## Cluster    19.7728  10 320.237 < 2.2e-16 ***
## Collection  6.5833  13  82.016 < 2.2e-16 ***
## Residuals   5.9521 964                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Marginal (least-squares) means per cluster, then all pairwise contrasts
# with Sidak adjustment for the 55 tests
marginal = lsmeans(model, ~ Cluster)
pairs(marginal, adjust="sidak")
##  contrast   estimate     SE  df t.ratio p.value
##  V1 - V10   0.133632 0.0553 964   2.417  0.5842
##  V1 - V11  -0.491634 0.0573 964  -8.584  <.0001
##  V1 - V2   -0.084395 0.0557 964  -1.516  0.9995
##  V1 - V3    0.003394 0.0188 964   0.181  1.0000
##  V1 - V4    0.091778 0.0524 964   1.752  0.9899
##  V1 - V5   -0.083909 0.0574 964  -1.463  0.9998
##  V1 - V6   -0.556971 0.0560 964  -9.942  <.0001
##  V1 - V7   -0.375493 0.0595 964  -6.312  <.0001
##  V1 - V8    0.058906 0.0585 964   1.007  1.0000
##  V1 - V9    0.034169 0.0590 964   0.579  1.0000
##  V10 - V11 -0.625266 0.0180 964 -34.793  <.0001
##  V10 - V2  -0.218027 0.0133 964 -16.342  <.0001
##  V10 - V3  -0.130238 0.0584 964  -2.230  0.7647
##  V10 - V4  -0.041854 0.0177 964  -2.368  0.6334
##  V10 - V5  -0.217541 0.0179 964 -12.167  <.0001
##  V10 - V6  -0.690603 0.0151 964 -45.589  <.0001
##  V10 - V7  -0.509125 0.0241 964 -21.137  <.0001
##  V10 - V8  -0.074726 0.0215 964  -3.479  0.0285
##  V10 - V9  -0.099463 0.0227 964  -4.389  0.0007
##  V11 - V2   0.407239 0.0168 964  24.193  <.0001
##  V11 - V3   0.495028 0.0603 964   8.212  <.0001
##  V11 - V4   0.583412 0.0231 964  25.202  <.0001
##  V11 - V5   0.407725 0.0203 964  20.069  <.0001
##  V11 - V6  -0.065337 0.0202 964  -3.236  0.0666
##  V11 - V7   0.116141 0.0263 964   4.412  0.0006
##  V11 - V8   0.550541 0.0239 964  23.033  <.0001
##  V11 - V9   0.525803 0.0247 964  21.270  <.0001
##  V2 - V3    0.087789 0.0588 964   1.494  0.9997
##  V2 - V4    0.176173 0.0189 964   9.345  <.0001
##  V2 - V5    0.000486 0.0181 964   0.027  1.0000
##  V2 - V6   -0.472576 0.0164 964 -28.839  <.0001
##  V2 - V7   -0.291098 0.0236 964 -12.354  <.0001
##  V2 - V8    0.143301 0.0216 964   6.648  <.0001
##  V2 - V9    0.118564 0.0229 964   5.180  <.0001
##  V3 - V4    0.088384 0.0557 964   1.588  0.9986
##  V3 - V5   -0.087303 0.0604 964  -1.446  0.9999
##  V3 - V6   -0.560366 0.0591 964  -9.482  <.0001
##  V3 - V7   -0.378887 0.0624 964  -6.073  <.0001
##  V3 - V8    0.055512 0.0615 964   0.903  1.0000
##  V3 - V9    0.030774 0.0619 964   0.497  1.0000
##  V4 - V5   -0.175687 0.0234 964  -7.524  <.0001
##  V4 - V6   -0.648749 0.0199 964 -32.663  <.0001
##  V4 - V7   -0.467271 0.0282 964 -16.578  <.0001
##  V4 - V8   -0.032871 0.0261 964  -1.261  1.0000
##  V4 - V9   -0.057609 0.0272 964  -2.119  0.8537
##  V5 - V6   -0.473062 0.0201 964 -23.547  <.0001
##  V5 - V7   -0.291584 0.0261 964 -11.152  <.0001
##  V5 - V8    0.142815 0.0237 964   6.026  <.0001
##  V5 - V9    0.118078 0.0243 964   4.850  0.0001
##  V6 - V7    0.181478 0.0259 964   7.012  <.0001
##  V6 - V8    0.615878 0.0236 964  26.055  <.0001
##  V6 - V9    0.591140 0.0244 964  24.220  <.0001
##  V7 - V8    0.434400 0.0283 964  15.371  <.0001
##  V7 - V9    0.409662 0.0296 964  13.853  <.0001
##  V8 - V9   -0.024738 0.0275 964  -0.900  1.0000
## 
## Results are averaged over the levels of: Collection 
## Note: contrasts are still on the as.numeric scale 
## P value adjustment: sidak method for 55 tests
# Compact letter display of the sidak-adjusted marginal means: clusters that
# share a letter are not significantly different.
CLD_RIP <- cld(marginal,
               alpha   = 0.05,
               Letters = letters,  ### Use lower-case letters for .group
               adjust  = "sidak")  ### sidak-adjusted p-values
# Strip the padding whitespace around the grouping letters
CLD_RIP$.group <- gsub(" ", "", CLD_RIP$.group)
# Place the letters 10% above the highest observed composite median
max_composite <- max(as.numeric(temp$Composite_median), na.rm = TRUE)
text_y <- max_composite + 0.1 * max_composite
RIP_plot +
  geom_text(data = CLD_RIP,
            aes(x = Cluster, y = text_y, label = .group),
            color = "black")

The RIP index does seem consistent with what Cécile has found, with higher RIP in the Middle-East and African populations and lower in the rest (in particular North America and Oceania).

As this pattern matches with the pattern of TE per cluster, I look at the relation between the amount of reads mapping on TE consensus and the level of RIP detected.

# Combine per-sample TE insertion counts (TE_qty) with the genome-wide RIP
# estimate and with the overall fraction of reads aligning to TEs.
TE_RIP = inner_join(TE_qty, RIP %>% filter(TE == "RIP_est") %>% dplyr::select(-Cluster)) %>%
  inner_join(read_delim(paste0(RIP_DIR, "Nb_reads.txt"), delim = " ") %>%
  # Guard against malformed rows where TE-aligned reads exceed the total
  dplyr::filter(Total_reads > Te_aligned_reads) %>%
  dplyr::mutate(Percent_TE_Reads = Te_aligned_reads * 100 / Total_reads))

# Keep missing sampling dates as an explicit "Unknown" category
TE_RIP$Sampling_Date[is.na(TE_RIP$Sampling_Date)] <- "Unknown"


# RIP vs TIPS
# Scatter of TE insertion number against the genome-wide RIP composite median,
# excluding the Hartmann FstQst 2015 collection...
p1 = TE_RIP %>%
  filter(Collection != "Hartmann_FstQst_2015") %>%
  ggplot(aes(as.numeric(Total_insertions), as.numeric(Composite_median),
                          color = Cluster, shape = Cluster))+
    theme_light()  +
    geom_point() + 
    Color_Cluster3 + Shape_Cluster2 + 
    labs(color = "Genetic cluster",
         x = "TE insertion number", y = "RIP composite median",
         subtitle = "Without the OG Hartmann collection") 
# ...and the same plot including all collections, for comparison
p2 = TE_RIP %>%
  ggplot(aes(as.numeric(Total_insertions), as.numeric(Composite_median),
                          color = Cluster, shape = Cluster))+
    theme_light()  +
    geom_point() + 
    Color_Cluster3 + Shape_Cluster2 + 
    labs(color = "Genetic cluster",
         x = "TE insertion number", y = "RIP composite median",
         subtitle = "With the OG Hartmann collection") 

# Assemble the two panels side by side, with a shared title and legend
ligne = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2+ theme(legend.position = "none"))

title <- ggdraw() + 
  draw_label("TE quantity vs RIP levels", fontface = 'bold',
    x = 0, hjust = 0) + theme_void() + theme(plot.margin = margin(0, 0, 0, 55))
  
cowplot::plot_grid(title, ligne, get_legend(p1 + theme(legend.position = "bottom")), 
                   ncol = 1, rel_heights = c(0.1, 1, 0.2), align = "hv")

# Same scatter restricted to a subset of clusters (V6, V7, V11 excluded),
# with a linear fit, R-squared / p-value and the regression equation.
# NOTE(review): the `..var..` notation used below is superseded by
# after_stat(var) in recent ggplot2 — verify against the installed version.
TE_RIP %>%
  filter(Collection != "Hartmann_FstQst_2015") %>%
  filter(Cluster %in% c("V1", "V10", "V2", "V3", "V4", "V5", "V8", "V9")) %>%
  ggplot(aes(as.numeric(Total_insertions), as.numeric(Composite_median)))+
    theme_light()  +
    geom_point(aes(color = Cluster, shape = Cluster)) + 
    Color_Cluster3 + Shape_Cluster2 + 
    labs(color = "Genetic cluster",
         x = "TE insertion number", y = "RIP composite median",
         subtitle = "Without the OG Hartmann collection")  + 
    geom_smooth(color = "grey20", linetype = "dashed", method = "lm", se =F)+
   stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 1) +
   stat_regline_equation(label.y = 1.07)

# Same regression, for cluster V2 alone
TE_RIP %>%
  filter(Collection != "Hartmann_FstQst_2015") %>%
  filter(Cluster == "V2") %>%
  ggplot(aes(as.numeric(Total_insertions), as.numeric(Composite_median)))+
    theme_light()  +
    geom_point(aes(color = Cluster, shape = Cluster)) + 
    Color_Cluster3 + Shape_Cluster2 + 
    labs(color = "Genetic cluster",
         x = "TE insertion number", y = "RIP composite median",
         subtitle = "Without the OG Hartmann collection")  + 
    geom_smooth(color = "grey20", linetype = "dashed", method = "lm", se =F)+
   stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 1) +
   stat_regline_equation(label.y = 1.07)

RIPped vs non-RIPped

If the RIP machinery is not functional in some populations, we would expect a bimodal distribution of RIP: older TEs would show RIP signatures, while recent insertions would be free or almost free of RIP signatures. The populations in which RIP is still active should instead contain only RIPped TEs.

# Per-sample percentage of TE reads lacking RIP signatures, joined to the TE
# insertion counts. Asian samples and samples without a continent or cluster
# assignment are excluded.
percent_low_rip <- read_delim(paste0(TE_RIP_dir, "Summary_RIP_non_RIP_reads.txt"),
                              delim = " ") %>%
  mutate(Percent = 100 * Non_ripped_count / (Non_ripped_count + Ripped_count)) %>%
  inner_join(TE_qty, by = c("Sample" = "ID_file")) %>%
  filter(Continent != "Asia", !is.na(Continent), !is.na(Cluster))


# Mean and range of the non-RIPped read percentage for clusters V11, V6, V7
percent_low_rip %>%
    filter(Cluster %in% c("V11", "V6", "V7")) %>%
    filter(Collection != "Hartmann_FstQst_2015") %>% 
    dplyr::summarize(average = mean(Percent), min(Percent), max(Percent))
## # A tibble: 1 × 3
##   average `min(Percent)` `max(Percent)`
##     <dbl>          <dbl>          <dbl>
## 1    7.48           5.18           9.99
# Same summary for the remaining clusters
percent_low_rip %>%
    filter(Cluster %in% c("V1", "V2", "V3","V4", "V5","V8", "V9", "V10")) %>%
    filter(Collection != "Hartmann_FstQst_2015") %>%
    # Redundant given the %in% filter above, but kept for safety
    filter(Cluster != "Hybrid")%>% 
    dplyr::summarize(average = mean(Percent), min(Percent), max(Percent))
## # A tibble: 1 × 3
##   average `min(Percent)` `max(Percent)`
##     <dbl>          <dbl>          <dbl>
## 1    20.6           11.3           34.6
# Median non-RIPped percentage per cluster, sorted ascending; the Cluster
# column is turned into a factor whose levels follow that sorted order so the
# facets below display from lowest to highest median.
median_percent <- percent_low_rip %>%
  filter(Collection != "Hartmann_FstQst_2015") %>%
  filter(Cluster != "Hybrid") %>%
  group_by(Cluster) %>%
  dplyr::summarise(Med_percent_rip = median(Percent)) %>%
  arrange(Med_percent_rip) %>%
  mutate(Cluster = factor(Cluster, levels = Cluster))

# Per-isolate barplot of the non-RIPped TE-read percentage, faceted by cluster
# (clusters ordered by their median percentage, computed above); the
# horizontal line marks each cluster's median.
percent_low_rip %>%
  filter(Collection != "Hartmann_FstQst_2015") %>%
  filter(Cluster != "Hybrid") %>%
  mutate(Cluster = factor(Cluster, levels = median_percent$Cluster)) %>%
  ggplot(aes(reorder(Sample, Percent), Percent, fill = Cluster)) +
    geom_bar(stat = "identity") +
    Fill_Cluster3 + 
    theme_light()  +
    geom_hline(data = median_percent, aes(yintercept = Med_percent_rip)) +
    # free_x + space free_x so each facet is only as wide as its sample count
    facet_grid(.~ Cluster, scales = "free_x", space = "free_x") +
    theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank(),
          axis.text.x = element_blank(), axis.ticks.x = element_blank(),
          panel.spacing.x=unit(0, "lines"), 
          legend.position = "bottom") +
    labs(x = "Isolates sorted by cluster and percentage of non-ripped TE-reads", 
         y = "Percentage of non-ripped TE-reads", 
         title = "Without Hartmann collection")

#read_delim(paste0(TE_RIP_dir, "Summary_RIP_non_RIP_reads.txt"), delim = " ") %>%
#  mutate(Percent = 100 * Non_ripped_count / (Non_ripped_count + Ripped_count))  %>%
#  inner_join(TE_qty, by = c("Sample" = "ID_file")) %>%
#  dplyr::count(Cluster, Continent) %>%
#  pivot_wider(names_from = Continent, values_from = n, values_fill = 0)
# Relationship between TE insertion number and the non-RIPped read percentage,
# fitted separately for each continent (free scales per facet).
percent_low_rip %>%
  filter(Collection != "Hartmann_FstQst_2015") %>%
  ggplot(aes(x = Total_insertions, y = Percent, color = Continent)) + 
  geom_point(alpha = .4) + 
  geom_smooth(method = "lm") + 
  theme_light() +
  Color_Continent + Fill_Continent +
  facet_wrap(vars(Continent), scale = "free") +
   stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 35, size = 3) +
   stat_regline_equation(label.y = 38, aes(label = ..eq.label..), size = 3) + 
  labs(y = "Percentage of non-ripped TE reads",
       subtitle = "Without the Hartmann collection")

# Same relationship colored by genetic cluster, without the Hartmann
# collection...
p1 = percent_low_rip %>%
  filter(Collection != "Hartmann_FstQst_2015") %>%
  ggplot(aes(x = Total_insertions, y = Percent, color = Cluster, shape = Cluster)) + 
  geom_point(alpha = .9) + 
  theme_light() +
  Shape_Cluster2 + Color_Cluster3 +
  labs(y = "Percentage of non-ripped TE reads",title = "Without Hartmann collection")

# ...and with all collections included
p2 = percent_low_rip %>%
  ggplot(aes(x = Total_insertions, y = Percent, color = Cluster, shape = Cluster)) + 
  geom_point(alpha = .9) + 
  theme_light() +
  Shape_Cluster2 + Color_Cluster3 +
  labs(y = "Percentage of non-ripped TE reads",title = "All collections")

p1

p2

# Does the fraction of non-RIPped TE reads predict the TE insertion load,
# after accounting for collection (batch) and genetic cluster?
model1 <- lm(Total_insertions ~ Collection + Cluster + Percent,
             data = percent_low_rip)
summary(model1)
## 
## Call:
## lm(formula = Total_insertions ~ Collection + Cluster + Percent, 
##     data = percent_low_rip)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -147.711  -20.572   -1.342   18.784  244.128 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      234.6468     8.5120  27.566  < 2e-16 ***
## CollectionETHZ_2020              105.1234     6.4423  16.318  < 2e-16 ***
## CollectionHartmann_FstQst_2015   -17.6457     7.1615  -2.464 0.013900 *  
## CollectionHartmann_Oregon_2016   179.9868     7.7116  23.340  < 2e-16 ***
## CollectionINRAE_BIOGER            70.6916     5.0982  13.866  < 2e-16 ***
## CollectionJGI                     61.8108     7.9987   7.728 2.56e-14 ***
## CollectionJGI_2                   44.8284     7.9198   5.660 1.95e-08 ***
## CollectionJGI_3                   61.9321     9.5411   6.491 1.31e-10 ***
## CollectionJGI_4                   92.0781    20.8784   4.410 1.14e-05 ***
## CollectionJGI_5                   76.8673     7.4087  10.375  < 2e-16 ***
## CollectionMMcDonald_2018         133.1208     9.9820  13.336  < 2e-16 ***
## CollectionSyngenta               208.3454     4.2842  48.631  < 2e-16 ***
## CollectionUnine_third_batch_2018 -15.5216    10.1339  -1.532 0.125912    
## ClusterV1                        -35.2236     9.5748  -3.679 0.000246 ***
## ClusterV10                        62.4303     7.4387   8.393  < 2e-16 ***
## ClusterV11                       -47.6819     7.8519  -6.073 1.76e-09 ***
## ClusterV2                         28.9798     5.3119   5.456 6.09e-08 ***
## ClusterV3                          1.0373    10.0067   0.104 0.917457    
## ClusterV4                         41.0599     9.9086   4.144 3.69e-05 ***
## ClusterV5                          1.6088     7.8200   0.206 0.837042    
## ClusterV6                        -33.0717     8.1793  -4.043 5.66e-05 ***
## ClusterV7                        -29.0594    10.3378  -2.811 0.005031 ** 
## ClusterV8                         51.8314    10.1741   5.094 4.15e-07 ***
## ClusterV9                          7.1309    10.6963   0.667 0.505126    
## Percent                            2.0034     0.3565   5.619 2.46e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.4 on 1049 degrees of freedom
##   (30 observations deleted due to missingness)
## Multiple R-squared:  0.8735, Adjusted R-squared:  0.8706 
## F-statistic: 301.7 on 24 and 1049 DF,  p-value: < 2.2e-16

With the Illumina data, we can look at each read but not necessarily at each copy. For this, I will look at the PacBio assemblies and the TE copies prediction and RIP estimation from Lorrain et al. 2021.

# Per-TE-copy RIP composite values from fully assembled (PacBio) genomes,
# taken from Lorrain et al. 2021, joined with the PacBio sample metadata.
Lorrain_RIP = read_tsv(paste0(TE_RIP_dir, "Lorrain_MeanMedianCompositeValue_RIP_TEcopies.txt")) %>%
  full_join(read_delim(paste0(metadata_dir, "PacBio_metadata.csv"), 
                         delim = ","))

# Number of annotated TE copies per assembly
TE_nb = Lorrain_RIP %>% dplyr::count(Sample, name = "TE_count") %>% arrange(TE_count)
  
# Ridgeline plot: distribution of per-copy RIP composite values for each
# isolate, isolates ordered by their median RIP.
Lorrain_RIP %>%
  left_join(TE_nb) %>%
  filter(Species == "Z. tritici") %>%
  group_by(Sample) %>%
  dplyr::mutate(median_RIP = median(median)) %>%
  ggplot(aes(y = reorder(Sample, median_RIP), x = median)) +
  geom_vline(xintercept = 0, linetype = "dashed") + 
  geom_density_ridges(aes(fill = Continent), alpha = .7) + 
  theme_light() +
  coord_cartesian(xlim = c(-2.5, 5)) + 
  Fill_Continent +
  labs(x = "RIP composite per TE copy", y = "Isolate",
       title = "Distribution of RIP in TE copies of fully assembled genomes")

RIP per TE consensus / superfamily

It is known that different TE groups, in particular the MITEs, which are particularly small, are less RIPped than other types of TEs. I wanted to check whether we saw such a pattern, so I visualize here the RIP per superfamily of TEs and then as related to the size of the consensus sequences.

#Per  TE superfamily
# Boxplots of the RIP composite index per TE superfamily, all samples pooled.
# Consensus/sample pairs with essentially no mapped reads are excluded since
# their RIP estimates are unreliable.
RIP  %>%
  filter(Normalized_nb_reads_mapped > 0.0001) %>%
  group_by(Superfamily)%>%
  # FIX: Composite_median is wrapped in as.numeric() everywhere else in this
  # document (it is handled as non-numeric after import) — convert here too so
  # median() operates on numeric values instead of the raw column.
  mutate(median_Superfamily=median(as.numeric(Composite_median), na.rm = TRUE) )%>%
  ggplot(aes(x = Superfamily,
             y = as.numeric(Composite_median),
             fill = median_Superfamily)) +
    geom_boxplot(outlier.shape = NA) +
    theme_light() +
    ylab("Median of composite index on TE reads") +
    theme(legend.position = "none",
          axis.text.x = element_text(angle = 40, hjust = 1)) +
  ylim(-1, 5) + geom_hline( yintercept = 0, color = "navy")

# Per-superfamily, per-continent summaries for the two boxplot panels below:
# the median RIP composite (FIX: via as.numeric(), consistent with every other
# use of Composite_median in this document) and the median normalized read
# count used as the alpha aesthetic (na.rm added defensively so a stray NA
# cannot silently blank out a whole group's alpha).
temp =  RIP  %>%
  filter(Continent != "Asia") %>%
  filter(Normalized_nb_reads_mapped > 0.0001) %>%
  group_by(Superfamily, Continent)%>%
  dplyr::mutate(median_Superfamily = median(as.numeric(Composite_median), na.rm = TRUE),
         for_alpha = median(Normalized_nb_reads_mapped, na.rm = TRUE))

# Panel 1: RIP per superfamily and continent for Class I TEs; the alpha
# channel down-weights superfamilies with few mapped reads.
p1 = filter(temp, Order == "Class I (retrotransposons)") %>%
  ggplot(aes(x = Superfamily,
             y = as.numeric(Composite_median),
             color = Continent, 
             fill = Continent, alpha = for_alpha)) +
    geom_boxplot(outlier.shape = NA) +
    theme_light() +
    labs(y ="RIP composite index", 
         title = "RIP on TE reads in the different continents") +
    theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  ylim(-1, 5) + geom_hline( yintercept = 0, color = "navy") +
  Color_Continent + Fill_Continent
# Panel 2: same plot for Class II TEs
p2 = filter(temp, Order == "Class II (DNA transposons)")%>%
  ggplot(aes(x = Superfamily,
             y = as.numeric(Composite_median),
             color = Continent,
             fill = Continent, alpha = for_alpha)) +
    geom_boxplot(outlier.shape = NA) +
    theme_light() +
    labs(y ="RIP composite index") +
    theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  ylim(-1, 5) + geom_hline( yintercept = 0, color = "navy") +
  Color_Continent+ Fill_Continent

# Stack the two panels and place the shared legend on the right
cowplot::plot_grid(cowplot::plot_grid(p1 + theme(legend.position = "None", axis.title.x = element_blank()),
                   p2 + theme(legend.position = "None"), 
                   ncol = 1, rel_heights = c(1, 1)),
                   get_legend(p1), nrow = 1, rel_widths = c(1, 0.3))

In other species, RIP has been shown to have a size limit under which it is not capable of recognizing the repeated sequences, so I check here whether we recognize a similar pattern and whether some of the shortest TEs are escaping RIP.

# As relating to TE size
# NB: in the following plot, the alpha parameter is set to make the TEs
# without any reads (or nearly so) invisible. This means that not all
# consensus sequences are visible; in particular, some annotated by Ursula as
# "verylowcopy" are not.
# Fasta index (.fai) of the TE consensus library: sequence name, length and
# byte-layout columns. Only TE and length are used downstream.
fai_path <- paste0(TE_RIP_dir, "Badet_BMC_Biology_2020_TE_consensus_sequences.fasta.fai")
TE_consensus_faidx <- read_delim(fai_path,
                                 delim = "\t",
                                 col_names = c("TE", "length", "offset",
                                               "number of bases per line",
                                               "number of bytes per line"))



# Median RIP per TE consensus plotted against consensus length, interactive
# via ggplotly (hover shows the TE name through the `text` aesthetic).
p = inner_join(RIP, TE_consensus_faidx) %>%
  dplyr::group_by(Order, TE, length) %>%
  filter (TE != "RIP_est") %>%
  filter(!is.na(Normalized_nb_reads_mapped)) %>%
  # BUG FIX: `na.rm = T` was passed as a third argument to summarize(), which
  # silently created a literal column named `na.rm` instead of removing NAs
  # from the second median; it now sits inside the median() call. as.numeric()
  # added on Composite_median for consistency with the rest of the document.
  dplyr::summarize(median_per_consensus = median(as.numeric(Composite_median), na.rm = TRUE),
                   norm_read_mapped = median(Normalized_nb_reads_mapped, na.rm = TRUE)) %>%
  ggplot(aes(x = length, y = median_per_consensus, color = Order)) +
  # alpha on a log scale hides consensus with (near) zero mapped reads
  geom_point(aes( text = TE, alpha = norm_read_mapped))  + theme_light() +
  geom_hline(yintercept = 0) +
  labs(x = "TE consensus length (bp, log10 scale)",
       y = "Median of RIP composite index",
       title = str_wrap(paste("Median of the RIP composite index for each TE consensus",
                        "against the length of the consensus sequence"), width = 60)) +
  scale_x_continuous(trans = "log10") +
  scale_alpha_continuous(trans = "log10") 

ggsave("RIP_per_consensus.pdf", plot = p)
ggplotly(p)

Is dim2 present in its intact version in some populations?

Next step will be to check whether dim2 is present where the RIP values suggest it should be: intact copies in the Middle East and Africa, and absence/degeneration elsewhere. Here, I use de novo genome assemblies and try to identify copies of dim2. For this, I use a deRIPped sequence of dim2 in the reference, blasted it onto Zt10 to get the sequence of Zt10_dim2, since it is known to have an intact sequence (see Mareike’s paper). I then compare this sequence (blastn) with the de novo assemblies to pull out all the copies and identify the highest identity score.

# Here Zt10_dim2_from_MgDNMT_deRIP.fa is the sequence of Zt10_6.417, which
# corresponds to the deRIPped version of dim2 in the reference IPO323:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2940312/pdf/GEN186167.pdf


# Cluster command used to blast the deRIPped dim2 against each assembly
# (kept for the record; already run):
#sbatch -p normal.168h --array=1-1195%50 Detect_gene_blast_array.sh /home/alice/WW_PopGen/Directories_new.sh /data2/alice/WW_project/0_Data/Zt10_dim2_from_MgDNMT_deRIP.fa /home/alice/WW_PopGen/Keep_lists_samples/Good_assemblies.args Illumina /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/

Let’s look at the results as dot plots and compare the results from the dim2 blast to the RIP composite index. So far the Middle-Eastern samples seem quite similar to one another, whereas other regions, such as Europe, contain far more variability.

In order to identify the native dim2 copy in genomes, I look for several things:

  • the blast match has to be on the same contig as at least one of the two known flanking genes
  • the blast match has to be between the two flanking genes if both are on the same contig or less than 10 kb from the known flanking gene on the same contig
# Concatenation of the per-sample blast outputs (already run once):
#system(paste0("cat ", DIM2_DIR, "*.blast.tab > ", DIM2_DIR, "Overall_results_blast_dim2.txt"))


# Full lengths (bp) of dim2 and its two flanking genes. A blast match is only
# trusted when it covers at least 80% of the corresponding full gene length.
min_match_fraction = 0.8
length_dim2 = 3846
length_flank1 = 3689
length_flank2 = 1222
threshold_length_dim2 = min_match_fraction * length_dim2
threshold_length_flank1 = min_match_fraction * length_flank1
threshold_length_flank2 = min_match_fraction * length_flank2

# Raw blast hits of dim2 and its flanking genes against every de novo assembly
blast_col_names <- c("sample", "gene", "qseqid", "sseqid", "pident", "length",
                     "mismatch", "gapopen", "qstart", "qend",
                     "sstart", "send", "evalue", "bitscore")
dim2_blast_results_complete = read_delim(
  paste0(DIM2_DIR, "Overall_results_blast_dim2.txt"),
  delim = " ",
  col_names = blast_col_names
)

# Blast hits of the first flanking gene (Zt10_unitig_006_0418), one row per
# retained hit. midcoord_flank1 is the midpoint of the match on the contig;
# hits covering less than the 80% length threshold are discarded.
flank1 = dim2_blast_results_complete %>%
  filter(gene == "dim2_flank1_Zt10_unitig_006_0418") %>%
  dplyr::select(sample, gene, sseqid, length, sstart, send) %>%
  dplyr::mutate(midcoord_flank1 = (sstart + send)/2) %>%
  filter(length > threshold_length_flank1) %>%
  # pivot_wider effectively renames sseqid to the gene name: each remaining
  # row is unique, so no values are aggregated
  pivot_wider(names_from = gene, values_from = sseqid) %>%
  dplyr::select(-length, -sstart, -send) %>%
  group_by(sample) %>%
  # nb_gene = number of retained flank1 hits per sample; the contig name is
  # duplicated into the shorter dim2_flank1 column (the long-named column is
  # kept too). NOTE(review): the result stays grouped by sample — downstream
  # joins use rows, not groups, so this looks harmless; confirm.
  dplyr::mutate(nb_gene = n(), 
                dim2_flank1 = dim2_flank1_Zt10_unitig_006_0418 )
  

# Blast hits of the second flanking gene (Zt10_unitig_006_0416); same logic
# as flank1 above: keep hits above the 80% length threshold, rename the
# contig column, and count hits per sample.
flank2 = dim2_blast_results_complete %>%
  filter(gene == "dim2_flank2_Zt10_unitig_006_0416") %>%
  dplyr::select(sample, gene, sseqid, length, sstart, send) %>%
  dplyr::mutate(midcoord_flank2 = (sstart + send)/2) %>%
  filter(length > threshold_length_flank2) %>%
  # pivot_wider just renames sseqid to the gene name (rows already unique)
  pivot_wider(names_from = gene, values_from = sseqid) %>%
  dplyr::select(-length, -sstart, -send) %>%
  group_by(sample) %>%
  # nb_gene = number of retained flank2 hits per sample
  dplyr::mutate(nb_gene = n(), 
                dim2_flank2 = dim2_flank2_Zt10_unitig_006_0416)

#Some dim2 copies have insertions and thus are identified as several matches
# I merge together the matches closer than 1kb
#grep "Zt10_dim2_from_MgDNMT_deRIP" Overall_results_blast_dim2.txt | awk 'BEGIN {OFS = "\t"} {if ($11 < $12) print $1":"$4, $11, $12, $5, $6; else print $1":"$4, $12, $11, $5, $6}' > Overall_results_blast_dim2.bed
#sort -k1,1 -k2,2n Overall_results_blast_dim2.bed > Overall_results_blast_dim2.sorted.bed
#bedtools merge -i Overall_results_blast_dim2.sorted.bed -c 4,5 -o collapse -d 1000 -delim ":" > /Overall_results_blast_dim2.sorted.merged_1kb.bed
# NOTE(review): the redirect above writes to "/Overall_results_..." at the
# filesystem root — probably a missing directory in the recorded command;
# confirm against the actual file location (DIM2_DIR is used when reading).

#And recreate the pident as the mean of the pident of the matches weigthed by the length of the fragments
dim2_blast_results = read_delim(paste0(DIM2_DIR, "Overall_results_blast_dim2.sorted.merged_1kb.bed"),
                                         delim = "\t", 
                                         col_names = c("temp", "sstart", "send", "pident_temp", "length_temp")) %>%
  # "temp" holds "<sample>:<contig>"; merged hits carry ":"-collapsed
  # pident/length values from bedtools merge above
  separate(col = temp, into = c("sample", "sseqid"), sep = ":") %>%
  separate_rows(pident_temp, length_temp, sep = ":", convert = TRUE) %>%
  group_by(sample, sseqid, sstart, send) %>%
  # One row per merged match: identity = length-weighted mean of its fragments
  dplyr::summarize(pident = weighted.mean(pident_temp, length_temp),
            length = sum(length_temp)) %>%
  #Add the flanks information and find the middle coordinates for the 3 genes
  # full_join keeps samples that have a flank hit but no dim2 match (their
  # match columns become NA); missing flanks are recoded just below
  full_join(., flank1, by = "sample") %>%
  full_join(., flank2, by = "sample") %>%
  dplyr::mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
                dim2_flank2 = replace_na(dim2_flank2, "Not_found"),
                midcoord_flank1 = replace_na(midcoord_flank1, 0),
                midcoord_flank2 = replace_na(midcoord_flank2, 0)) %>%
  dplyr::mutate(midcoord_flank1 = as.numeric(midcoord_flank1),
                midcoord_flank2 = as.numeric(midcoord_flank2))  %>%
  # Midpoint of the dim2 match on its contig
  dplyr::mutate(ave_coord = (sstart + send)/2)  %>%
  # min_fl/max_fl: the two flank midpoints in ascending order, used later to
  # test whether the dim2 match lies between the flanking genes
  dplyr::mutate(min_fl = ifelse(midcoord_flank1 > midcoord_flank2,
                                midcoord_flank2, midcoord_flank1))  %>%
  dplyr::mutate(max_fl = ifelse(midcoord_flank1 > midcoord_flank2,
                                midcoord_flank1, midcoord_flank2)) %>%
  # Attach sample metadata (Zt_meta is defined earlier in the file)
  inner_join(Zt_meta, by = c("sample" = "ID_file")) %>%
  #Keep only the samples with either one or zero match for each flanking gene
  # (nb_gene.x / nb_gene.y are the flank1 / flank2 hit counts, suffixed by
  # the two joins above)
  filter(nb_gene.x <= 1 & nb_gene.y <= 1)

#Now, let's compare the blast results of dim2 with the flanking genes
# (contig name and distance)
dim2_blast_results = dim2_blast_results %>%
  # is_same_contig: which flanking gene(s) sit on the same contig as the hit
  # ("Both"/"Flank1"/"Flank2"/"None"; "What" is an unreachable sentinel).
  # Missing flanks were recoded to "Not_found" above, so a comparison only
  # yields NA when sseqid itself is NA (sample with flanks but no dim2 hit);
  # ifelse then propagates that NA — do not convert this to case_when, which
  # would fall through to the last branch instead.
  dplyr::mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
                                 "Both",
                                 ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
                                 "None",
                                 ifelse(sseqid == dim2_flank1, "Flank1",
                                 ifelse(sseqid == dim2_flank2, "Flank2", "What"))))) %>%
  # distance: is the hit where the native copy should be?
  #  - "Both" on same contig: the hit midpoint must lie between the two flank
  #    midpoints, else "Not_between"
  #  - one flank on same contig: the hit must be within 10 kb of that flank's
  #    midpoint, else "Too_far"
  dplyr::mutate(distance = ifelse(is_same_contig == "None", "No_distance",
                           ifelse(is_same_contig == "Both",
                                  ifelse(ave_coord >= min_fl &
                                         ave_coord <= max_fl ,
                                  "Distance_OK", "Not_between"),
                                  ifelse(is_same_contig == "Flank1",
                                         ifelse((abs(midcoord_flank1 - ave_coord) <= 10000),
                                  "Distance_OK", "Too_far"),
                                  ifelse((abs(midcoord_flank2 - ave_coord) <= 10000),
                                  "Distance_OK", "Too_far"))))) %>%
  # Two-category summary: ">1" = at least one flank found at a plausible
  # distance; anything else ("None", "Not_between", "Too_far") collapses to "0"
  mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0",
                                         ifelse(distance == "Distance_OK", ">1", "0")))



  #dplyr::select(sample, sseqid, length, dim2_flank1, dim2_flank2, is_same_contig) %>%
# Table percentages
temp = ungroup(dim2_blast_results) %>% dplyr::count(is_same_contig, name = "Nb_blast") %>%
  mutate(Percentage = round(100*Nb_blast/sum(Nb_blast), 1))
kable(temp)
is_same_contig Nb_blast Percentage
Both 387 3.4
Flank1 501 4.4
Flank2 306 2.7
None 10123 89.4
#The insertions in some copies are longer than 1kb and cause the native gene copy to be recognized as two separate blast matches. The stats don't make sense without taking this into consideration.
#temp = ungroup(dim2_blast_results) %>% dplyr::count(is_same_contig, sample, name = "Nb_blast") %>% pivot_wider(names_from = "is_same_contig", values_from = "Nb_blast")

#Pie chart and length density
nb_sample = length(unique(dim2_blast_results$sample))

# Density of blast-match lengths, coloured by flank co-localisation category
p1 = dim2_blast_results %>%
  ggplot(aes(length, fill = is_same_contig, color = is_same_contig)) +
  geom_density(alpha = 0.5) +
  theme_light() +
  scale_color_manual(values = c("#ff9f1c", "#1B998B", "#2ec4b6", "#EDE7E3")) +
  scale_fill_manual(values = c("#ff9f1c", "#1B998B", "#2ec4b6", "#EDE7E3")) +
  labs(x = "Length (bp)",
       y = "Density",
       title = str_wrap(paste0("Blast matches against Zt10dim2 for ", nb_sample, " samples"),
                        width = 60),
       fill = "Neighbouring flanking genes",
       color = "Neighbouring flanking genes")

##Pie
# Label positions: after sorting categories in reverse alphabetical order,
# ypos is the midpoint of each stacked segment.
temp <- temp %>% 
  arrange(desc(is_same_contig)) %>%
  # FIX: compute the total from the data-masked column rather than the global
  # `temp$Nb_blast`, which silently bypasses any grouping/filtering applied to
  # the pipeline input (same value here, but fragile).
  mutate(prop = Nb_blast / sum(Nb_blast) * 100) %>%
  mutate(ypos = cumsum(Nb_blast) - 0.5 * Nb_blast)

p2 = temp %>%
  ggplot(aes(x = "")) +
  geom_bar(aes(fill = is_same_contig, y = Nb_blast), stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  theme_void() +
  scale_color_manual(values = c("#ff9f1c", "#1B998B", "#2ec4b6", "#EDE7E3")) +
  scale_fill_manual(values = c("#ff9f1c", "#1B998B", "#2ec4b6", "#EDE7E3")) +
  # NOTE(review): ypos assumes the stacking order matches the arrange() order
  # above — verify that each label sits on its own wedge.
  geom_text_repel(aes(y = ypos, label = paste0(Percentage, "%")), color = "black", size = 4)



# Overlay the pie chart as an inset in the bottom-left of the density plot
ggdraw(p1) +
  draw_plot(p2 + theme(legend.position = "None"), x = 0.08, y = 0.05, width = 0.25, height =  1.3) 

Now, I will select only the copies which have at least one flanking gene found on the same contig. I consider these the “native” copy of the gene.

Here a large proportion of the Middle-Eastern samples from the old Hartmann dataset have two matches on the “right” contigs: this is due to a deletion found in one of the alleles of dim2 (shown in Mareike’s new version of her dim2 manuscript).

Let’s investigate quickly how many long copies there are in each genome. I’m also interested in the native match as related to RIP and geography.

# Per-sample counts of all dim2 blast matches and of "long" (> 1 kb) matches
temp = dim2_blast_results %>%
  group_by(sample) %>%
  dplyr::summarize(n_matches = n(),
                   n_long_matches = sum(length > 1000))

# Candidate native dim2 copies: matches with at least one flank at a
# plausible distance, whose summed length does not exceed the intact gene
# length by more than 200 bp.
sum_dim2_blast = dim2_blast_results %>%
  dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
  group_by(sample) %>%
  dplyr::summarize(length_sum = sum(length),
                   pident_wm = weighted.mean(pident, length),
                   n_matches_on_native_contigs = n()) %>%
  filter(length_sum < length_dim2 + 200) %>%
  # joins by "sample" (implicit key); RIP's "RIP_est" rows hold the
  # per-isolate composite index used downstream
  inner_join(., temp) %>%
  inner_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file"))


# Native-copy identity vs. per-isolate RIP composite index, all collections
sum_dim2_blast %>%
  ggplot(aes(as.numeric(Composite_median), pident_wm, color = Cluster, shape = Cluster)) +
  geom_point() +
  Shape_Cluster2 + Color_Cluster3 +
  theme_light() +
  theme(legend.position = "none") +
  labs(x = str_wrap("RIP composite index (median per isolate)",
                    width = 40),
       y = str_wrap("Identity of the native match", width = 40),
       title = "Identity with functional dim2 and RIP composite index of TEs",
       subtitle = "All collections")

# Same plot as above, excluding the Hartmann collection
sum_dim2_blast %>%
  filter(Collection != "Hartmann_FstQst_2015") %>%
  ggplot(aes(as.numeric(Composite_median), pident_wm, color = Cluster, shape = Cluster))+
  geom_point() + 
  Shape_Cluster2 + Color_Cluster3 +
  theme_light() + theme(legend.position = "none") +
    # FIX: paste() with its default sep produced a double space in the axis
    # label ("index  (median..."); use the same single string as the
    # all-collections plot for consistency.
    labs(x = str_wrap("RIP composite index (median per isolate)",
                      width = 40),
         y = str_wrap("Identity of the native match",
                      width = 40),
         title = "Identity with functional dim2 and RIP composite index of TEs",
         subtitle = "Without Hartmann collection")

And then as boxplots per continental region.

# Native-copy identity per genetic cluster, clusters ordered by mean identity
sum_dim2_blast %>%
  filter(!is.na(Cluster)) %>%
  group_by(Cluster) %>%
  dplyr::mutate(cluster_mean_ident = mean(pident_wm, na.rm = TRUE)) %>%
  ggplot(aes(reorder(Cluster, cluster_mean_ident), pident_wm)) +
  geom_violin(aes(color = Cluster, fill = Cluster), alpha = .4) +
  geom_boxplot(width = .1, aes(color = Cluster)) +
  Color_Cluster3 + Fill_Cluster3 +
  theme_light() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 10, angle = 40, hjust = 1)) +
  labs(x = NULL, y = "Identity of the native copy to Zt10dim2")

# Number of long (> 1 kb) blast matches per genome, by genetic cluster,
# clusters ordered by their mean count
sum_dim2_blast %>%
  filter(!is.na(Cluster)) %>%
  group_by(Cluster) %>%
  dplyr::mutate(cluster_mean_count = mean(n_long_matches, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(Cluster, cluster_mean_count), y = n_long_matches)) +
  geom_violin(aes(color = Cluster, fill = Cluster), alpha = .4) +
  geom_boxplot(width = .1, aes(color = Cluster)) +
  Color_Cluster3 + Fill_Cluster3 +
  theme_light() +
  theme(axis.title = element_text(size = 10)) +
  labs(x = str_wrap("Cluster", width = 20),
       y = str_wrap("Number of long blast matches (above 1kb)",
                    width = 50),
       color = "Genetic cluster")

Gene duplication and RIP

If there is a relaxation of RIP, we could expect that TEs would not be the only things impacted: gene duplications should also be more frequent in RIP-relaxed genomes than in RIP-active ones.

# Pangenome table from Badet et al.; harmonise the four Swiss isolate names
# with the naming used elsewhere in this analysis
swiss_names <- c("1A5" = "Zt1A5", "1E4" = "Zt1E4",
                 "3D1" = "Zt3D1", "3D7" = "Zt3D7")
Badet_pan_table = read_tsv(paste0(TE_RIP_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt")) %>%
  mutate(isolate = ifelse(isolate %in% names(swiss_names),
                          unname(swiss_names[isolate]),
                          isolate))

# Per-strain RIP composite values from Lorrain et al., joined to the PacBio
# sample metadata
Lorrain_RIP = read_tsv(paste0(TE_RIP_dir, "Lorrain_MeanMedianTECompositeValue_RIP_perStrain.txt")) %>%
  full_join(read_delim(paste0(metadata_dir, "PacBio_metadata.csv"),
                       delim = ","),
            by = c("Strain" = "Sample"))
  

# Per-isolate counts of duplicated orthogroups, by copy-number category.
# nb_genes is computed BEFORE filtering, so it is the total number of
# orthogroup rows per isolate and serves as the Percent denominator;
# mean(nb_genes) in summarize() just carries that per-isolate constant
# through the aggregation.
temp = Badet_pan_table %>%
  dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
  group_by(isolate) %>%
  dplyr::mutate(nb_genes = n()) %>%
  # keep only orthogroups present in two or more copies in this isolate
  dplyr::filter(N_genes >= 2) %>%
  # copy-number category as a string, capped at "9 +"
  dplyr::mutate(N_genes_cat = ifelse(N_genes >= 9, "9 +", as.character(N_genes))) %>%
  group_by(isolate, N_genes_cat) %>%
  dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
  dplyr::mutate(Percent = 100 * count / nb_genes) 


# Duplicated-gene counts (copy number 2-8) against the RIP composite median,
# one labelled point per isolate
inner_join(temp, Lorrain_RIP, by = c("isolate" = "Strain")) %>%
  filter(N_genes_cat != "9 +") %>%
  group_by(Composite_median, isolate, Continent) %>%
  dplyr::summarize(count = sum(count)) %>%
  ggplot(aes(x = count, y = Composite_median, label = isolate, color = Continent)) +
  geom_text() +
  Color_Continent + theme_light() +
  labs(x = "Number of genes found in several copies", y = "RIP composite median on TE",
       title = "Copy number between 2 and 9")

# Same view restricted to orthogroups with exactly two copies
inner_join(temp, Lorrain_RIP, by = c("isolate" = "Strain")) %>%
  filter(N_genes_cat == "2") %>%
  group_by(Composite_median, isolate, Continent) %>%
  dplyr::summarize(count = sum(count)) %>%
  ggplot(aes(x = count, y = Composite_median, label = isolate, color = Continent)) +
  geom_text() +
  Color_Continent + theme_light() +
  labs(x = "Number of genes found with two copies", y = "RIP composite median on TE")